# -*- coding: utf-8 -*-
"""
Created on Wed Feb 28 18:10:36 2024

@author: STEFAN
"""

#Import all libraries
import pandas as pd
import re
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#import nltk
#from nltk.stem import PorterStemmer
#from gensim import corpora, models
#import numpy as np
import matplotlib.colors as mcolors
import random
import calendar
import string
import streamlit as st

# Words dropped before building a word cloud: the 'Media' / 'omitted'
# placeholder words and the stray 's' / 'm' fragments that showed up a lot
# during testing.
_NOISE_TOKENS = frozenset({'Media', 'omitted', 's', 'm'})

# CSS4 colour names excluded from the sender palette (light shades).
_EXCLUDED_COLOURS = frozenset({
    'lightgray', 'lightgrey', 'whitesmoke', 'white', 'snow', 'mistyrose',
    'lightcoral', 'seashell', 'peachpuff', 'linen', 'bisque', 'antiquewhite',
    'papayawhip', 'lemonchiffon', 'beige', 'blanchedalmond', 'oldlace',
    'floralwhite', 'cornsilk', 'ivory', 'lightyellow', 'lightgoldenrodyellow',
    'honeydew', 'mintcream', 'azure', 'lightcyan', 'lightskyblue', 'aliceblue',
    'lightsteelblue', 'ghostwhite', 'lavender', 'lavenderblush', 'lightpink',
})

# Order in which the sentiment classes are drawn in the line plots.
_SENTIMENT_ORDER = ('positive', 'neutral', 'negative')


def _clean_message(message, stop_words, table):
    """Return *message* without stop words, punctuation or embedded spaces."""
    # Stop-word lists are lower-case, so compare case-insensitively.
    kept = [tok.translate(table) for tok in word_tokenize(message)
            if tok.lower() not in stop_words]
    return " ".join(kept)


def _word_cloud_text(messages):
    """Join *messages* into one string, dropping every noise token."""
    return " ".join(
        token
        for message in messages
        for token in message.split()
        if token not in _NOISE_TOKENS
    )


def _polarity_label(compound):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'."""
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


def _save_and_show(fig, savefigs, directory, filename):
    """Optionally save *fig*, render it in Streamlit, then close it."""
    if savefigs:
        import os
        # os.path.join works whether or not `directory` ends with a separator
        # (the old `directory[:-1] + name` chopped a real character otherwise).
        # bbox_inches='tight' keeps labels/legends drawn outside the axes.
        fig.savefig(os.path.join(directory, filename), bbox_inches="tight")
    st.pyplot(fig)
    # Close explicitly: pyplot keeps every figure alive until it is closed.
    plt.close(fig)


def _show_word_cloud(text, title, savefigs, directory, filename, **cloud_options):
    """Render a 700x700 word cloud for *text*; skip it if there are no words."""
    try:
        cloud = WordCloud(width=700, height=700,
                          background_color='white',
                          min_font_size=10, **cloud_options).generate(text)
    except ValueError:
        # WordCloud raises ValueError when no usable word is left.
        st.warning(f"Not enough words to draw: {title}")
        return
    fig = plt.figure(figsize=(5, 5), facecolor=None)
    plt.imshow(cloud)
    plt.title(title)
    plt.axis("off")
    plt.tight_layout(pad=0)
    _save_and_show(fig, savefigs, directory, filename)


def WhatsappAnalysis(df,nombregrupo,savefigs=False,directory=''):
    """Render a set of Streamlit charts describing a WhatsApp group chat.

    Charts, in order: messages per sender, messages per sender per year, a
    global word cloud, one word cloud per sender, yearly sentiment per sender
    (stacked bars, then lines) and two histograms of the hour of day at which
    messages were sent. Sentiment percentages are also printed to stdout.

    Args:
        df: DataFrame with string columns 'Message', 'Sender', 'Date' and
            'Time'. 'Date' is parsed as 'M/D/YY' (month first, two-digit
            year) and rows are expected in chronological order. The frame is
            not modified.
        nombregrupo: Group name shown in the first chart's title.
        savefigs: When True, every figure is also written as a PNG.
        directory: Folder the PNGs are written to ('' = working directory).

    Returns:
        None. Senders are identified by the first word of their name only.
    """
    if df.empty:
        st.warning("No messages to analyse.")
        return

    # Private copy with a clean 0..n-1 index so the caller's frame is left
    # untouched and positional assumptions hold.
    dfFilter = df.copy().reset_index(drop=True)

    # Keep the untouched text for sentiment scoring: stripping stop words
    # also strips negations ("not", "no") and punctuation.
    # NOTE(review): VADER appears to use both cues - confirm against its docs.
    raw_messages = dfFilter['Message'].fillna('').astype(str)

    # Cleaned text (no stop words / punctuation) feeds the word clouds.
    stop_words = set(stopwords.words('english'))
    table = str.maketrans({ch: '' for ch in string.punctuation + ' '})
    dfFilter['Message'] = raw_messages.apply(
        lambda text: _clean_message(text, stop_words, table))

    # Identify senders by first name, in order of first appearance.
    dfFilter['Sender'] = dfFilter['Sender'].str.split().str[0]
    Senders = dfFilter['Sender'].dropna().unique().tolist()

    # One random colour per sender; cycle through the palette if there are
    # more senders than colours instead of letting random.sample raise.
    palette = [name for name in mcolors.CSS4_COLORS if name not in _EXCLUDED_COLOURS]
    if len(Senders) <= len(palette):
        colorscheme = random.sample(palette, len(Senders))
    else:
        shuffled = random.sample(palette, len(palette))
        colorscheme = [shuffled[i % len(shuffled)] for i in range(len(Senders))]
    colour_dict = dict(zip(Senders, colorscheme))

    # First month / year of the chat, taken from the first message's date.
    first_date = dfFilter.iloc[0]['Date'].split('/')
    first_year = 2000 + int(first_date[2])
    first_month = calendar.month_name[int(first_date[0])]

    # --- Messages per sender -------------------------------------------
    totals = dfFilter['Sender'].value_counts()
    NumMessages = [int(totals.get(sender, 0)) for sender in Senders]
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    ax.set_ylabel('Messages')
    ax.set_xlabel('Sender')
    ax.set_title(f'Number of Messages in {nombregrupo} by sender since {first_month} of {first_year}')
    ax.bar(Senders,NumMessages,color=colorscheme)
    _save_and_show(fig, savefigs, directory, "Global Messages.png")

    # --- Messages per sender per year ----------------------------------
    # Last two digits of each date string are the two-digit year.
    two_digit_year = dfFilter['Date'].str.extract(r'.*(\d{2})', expand = False)
    last_year = 2000 + int(two_digit_year.iloc[-1])
    years = list(range(first_year, last_year + 1))
    per_year = {}
    for year in years:
        # Zero-pad so that e.g. 2009 matches "09" rather than "9".
        in_year = two_digit_year == f"{year - 2000:02d}"
        counts = Counter(dfFilter.loc[in_year, 'Sender'])
        # Counter yields 0 for senders with no message that year.
        per_year[year] = {sender: counts[sender] for sender in Senders}

    fig = plt.figure()
    for sender in Senders:
        yearly = [per_year[year][sender] for year in years]
        plt.plot(years, yearly, label=sender, color = colour_dict[sender])
    fig.suptitle('Evolution of Number of Messages Sent per Year', fontsize=12)
    plt.xticks(range(first_year,last_year+1))
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Messages', fontsize=12)
    plt.legend(loc="best")
    _save_and_show(fig, savefigs, directory, "Evolution Messages.png")

    # --- Word clouds ---------------------------------------------------
    _show_word_cloud(
        _word_cloud_text(dfFilter['Message']),
        "Global Word Cloud since " + first_month + " of " + str(first_year),
        savefigs, directory, "WordCloud Global.png",
        normalize_plurals=False)

    for sender in Senders:
        own_messages = dfFilter.loc[dfFilter['Sender'] == sender, 'Message']
        _show_word_cloud(
            _word_cloud_text(own_messages),
            f"Word cloud for {sender} from {first_month} of {first_year}",
            savefigs, directory, "WordCloud for" + sender + ".png")

    # --- Sentiment analysis (VADER) ------------------------------------
    sid = SentimentIntensityAnalyzer()
    dfFilter['Sentiment'] = raw_messages.apply(
        lambda text: _polarity_label(sid.polarity_scores(text)['compound']))

    # Share of each sentiment over all messages and per sender.
    sentiment_percentage_all = dfFilter['Sentiment'].value_counts(normalize=True) * 100
    per_sender = dfFilter.groupby(['Sender', 'Sentiment']).size().unstack(fill_value=0)
    sentiment_percentage_per_sender = per_sender.div(per_sender.sum(axis=1), axis=0) * 100

    print("Sentiment Percentage for All Messages:")
    print(sentiment_percentage_all)
    print("\nSentiment Percentage per Sender:")
    print(sentiment_percentage_per_sender)

    # Share of each sentiment per (year, sender).
    dfFilter['Year'] = pd.to_datetime(dfFilter['Date']).dt.year
    per_year_sender = dfFilter.groupby(['Year', 'Sender', 'Sentiment']).size().unstack(fill_value=0)
    sentiment_percentage_per_year_per_sender = per_year_sender.div(per_year_sender.sum(axis=1), axis=0) * 100

    # Stacked bars, one figure per sender.
    for sender in Senders:
        data = sentiment_percentage_per_year_per_sender.loc[(slice(None), sender), :]
        data.unstack().plot(kind='bar', stacked=True, title=f'Sentiment Percentage per Year for {sender}')
        plt.ylabel('Percentage')
        plt.xlabel('Year')
        plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
        _save_and_show(plt.gcf(), savefigs, directory, f"Sentiment Analysis for {sender}.png")

    # Line evolution, one figure per sender.
    year_ticks = sorted(dfFilter['Year'].unique())
    for sender in Senders:
        data = sentiment_percentage_per_year_per_sender.loc[(slice(None), sender), :]
        # Guarantee all three classes exist (as 0 %) even if one never
        # occurs in the chat; otherwise data[sentiment] raises KeyError.
        data = data.reindex(columns=list(_SENTIMENT_ORDER), fill_value=0).unstack()

        fig = plt.figure(figsize=(10, 6))
        for sentiment in _SENTIMENT_ORDER:
            plt.plot(data[sentiment], label=sentiment.capitalize())
        plt.title(f'Sentiment Percentage per Year for {sender}')
        plt.xlabel('Year')
        plt.ylabel('Percentage')
        plt.xticks(year_ticks, rotation=45)  # one tick per year present
        plt.legend(title='Sentiment',loc="best")
        plt.grid(True)
        _save_and_show(fig, savefigs, directory, f"Sentiment Percentage Line Evolution for {sender}.png")

    # --- Hour-of-day activity ------------------------------------------
    # 'mixed' lets pandas infer the time format per element.
    dfFilter['Activation_Time'] = pd.to_datetime(dfFilter['Time'], format='mixed').dt.hour
    # Fixed one-hour bins (0-24) so bars line up with the hour ticks and
    # with each other across senders, whatever hours actually occur.
    hour_bins = list(range(25))

    fig = plt.figure(figsize=(10, 6))
    plt.hist(dfFilter['Activation_Time'], bins=hour_bins, edgecolor='black', alpha=0.7)
    plt.title('Distribution of Message Activation Times')
    plt.xlabel('Activation Time (Hour of Day)')
    plt.ylabel('Number of Messages')
    plt.xticks(range(24))
    plt.grid(True)
    _save_and_show(fig, savefigs, directory, "Activation Times.png")

    # Same histogram, overlaid per sender.
    fig = plt.figure(figsize=(10, 6))
    for sender in Senders:
        plt.hist(dfFilter.loc[dfFilter['Sender'] == sender, 'Activation_Time'], bins=hour_bins,
                 edgecolor='black', color = colour_dict[sender], alpha=0.7, label = sender)
    plt.title('Distribution of Message Activation Times')
    plt.xlabel('Activation Time (Hour of Day)')
    plt.ylabel('Number of Messages')
    plt.xticks(range(24))
    plt.legend(loc='upper left')
    plt.grid(True)
    _save_and_show(fig, savefigs, directory, "Activation Times by Senders.png")