# -*- coding: utf-8 -*-
"""
Created on Wed Feb 28 18:10:36 2024

@author: STEFAN
"""

#Import all libraries
import os
import pandas as pd
import re
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#import nltk
#from nltk.stem import PorterStemmer
#from gensim import corpora, models
#import numpy as np
import matplotlib.colors as mcolors
import random
import calendar
import string
import streamlit as st


# CSS4 colour names that are too pale to be readable on a white background;
# they are excluded when assigning one colour per sender.
_PALE_COLOURS = frozenset([
    'lightgray', 'lightgrey', 'whitesmoke', 'white', 'snow', 'mistyrose',
    'lightcoral', 'seashell', 'peachpuff', 'linen', 'bisque', 'antiquewhite',
    'papayawhip', 'lemonchiffon', 'beige', 'blanchedalmond', 'oldlace',
    'floralwhite', 'cornsilk', 'ivory', 'lightyellow', 'lightgoldenrodyellow',
    'honeydew', 'mintcream', 'azure', 'lightcyan', 'lightskyblue', 'aliceblue',
    'lightsteelblue', 'ghostwhite', 'lavender', 'lavenderblush', 'lightpink',
])

# Tokens dropped before building word clouds. 'Media'/'omitted' come from
# WhatsApp's media placeholder; 's' and 'm' appeared a lot during testing,
# presumably as tokenizer left-overs or non-text items such as stickers.
_NOISE_TOKENS = frozenset(['Media', 'omitted', 's', 'm'])

# Sentiment labels produced by the analysis, in the (alphabetical) column
# order that groupby/unstack yields when all three are present.
_SENTIMENTS = ['negative', 'neutral', 'positive']

# Fixed hourly bin edges (0..24) so histogram bars line up with hour ticks.
_HOUR_BINS = list(range(25))


def _strip_stop_words(message, stop_words, table):
    """Return *message* with English stop words and punctuation removed."""
    # NLTK's stop-word list is lowercase, so compare on the lowercased token.
    kept = [w for w in word_tokenize(message) if w.lower() not in stop_words]
    # The stop-word filter still lets punctuation / special characters
    # through, so strip those from every remaining token.
    return " ".join(w.translate(table) for w in kept)


def _join_clean_tokens(messages):
    """Join all *messages* into one text, dropping every noise token."""
    words = []
    for message in messages:
        words.extend(t for t in message.split() if t not in _NOISE_TOKENS)
    return " ".join(words)


def _show_figure(fig, savefigs, directory, filename):
    """Optionally save *fig*, render it in Streamlit, then release it."""
    if savefigs:
        fig.savefig(os.path.join(directory, filename))
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed explicitly.
    plt.close(fig)


def _plot_word_cloud(text, title, savefigs, directory, filename, **cloud_kwargs):
    """Draw a word cloud of *text*; warn and skip when there are no words."""
    try:
        wordcloud = WordCloud(width = 700, height = 700,
                              background_color ='white',
                              min_font_size = 10, **cloud_kwargs).generate(text)
    except ValueError:
        # WordCloud raises ValueError when the text has no usable words
        # (e.g. a sender who only sent media).
        st.warning(f"Not enough text to draw: {title}")
        return
    fig = plt.figure(figsize = (5, 5), facecolor = None)
    plt.imshow(wordcloud)
    plt.title(title)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    _show_figure(fig, savefigs, directory, filename)


def WhatsappAnalysis(df,nombregrupo,savefigs=False,directory=''):
    """Render a set of Streamlit charts describing a WhatsApp chat.

    The charts are: messages per sender, messages per sender per year, a
    global word cloud, one word cloud per sender, sentiment percentages per
    year per sender (stacked bars and lines), and histograms of the hour of
    day at which messages were sent (overall and per sender).

    Parameters
    ----------
    df : pandas.DataFrame
        Chat data. The columns 'Message', 'Sender', 'Date' and 'Time' are
        read. 'Date' is split on '/' with the month first and the year last.
        The frame is copied; the caller's object is not modified.
    nombregrupo : str
        Group name, used in the title of the first chart.
    savefigs : bool, optional
        When True every figure is also written to ``directory`` as a PNG.
    directory : str, optional
        Folder in which figures are saved. An empty string saves relative to
        the current working directory.

    Returns
    -------
    None
        Output is drawn with ``st.pyplot``; overall sentiment tables are also
        printed to stdout.
    """
    #Work on a copy with a clean 0..n-1 index so the caller's frame is untouched
    dfFilter = df.copy().reset_index(drop=True)
    if dfFilter.empty:
        st.warning("The chat contains no messages to analyse.")
        return

    #Get stop words for filtering
    stop_words = set(stopwords.words('english'))
    #Translation table that deletes punctuation and the space character
    del_dict = {sp_character: '' for sp_character in string.punctuation}
    del_dict[' '] = ''
    table = str.maketrans(del_dict)

    #Eliminate stop words and punctuation from every message
    dfFilter['Message'] = [_strip_stop_words(message, stop_words, table)
                           for message in dfFilter['Message']]

    #Keep only the first word of each sender's name
    dfFilter['Sender'] = dfFilter['Sender'].str.split().str[0]
    #First-seen order keeps the charts stable between runs
    Senders = list(dfFilter['Sender'].unique())

    #Create the colour scheme: one readable colour per sender
    colours = [x for x in mcolors.CSS4_COLORS if x not in _PALE_COLOURS]
    num_senders = len(Senders)
    if num_senders <= len(colours):
        colorscheme = random.sample(colours, num_senders)
    else:
        #More senders than colours: allow repeats instead of failing
        colorscheme = random.choices(colours, k=num_senders)
    #Store colorscheme as a dictionary as well. This is to make plotting easier
    colour_dict = dict(zip(Senders, colorscheme))

    #Obtain the first year and month by looking at the first message
    splitline_date = dfFilter.iloc[0]['Date'].split('/')
    #Modulo 100 accepts both two-digit and four-digit years
    anio = 2000 + int(splitline_date[2]) % 100
    mes = calendar.month_name[int(splitline_date[0])]

    #Get a graph of different messages by sender
    NumMessages = [int((dfFilter['Sender'] == sender).sum()) for sender in Senders]
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    ax.set_ylabel('Messages')
    ax.set_xlabel('Sender')
    ax.set_title(f'Number of Messages in {nombregrupo} by sender since {mes} of {anio}')
    ax.bar(Senders,NumMessages,color=colorscheme)
    _show_figure(fig, savefigs, directory, "Global Messages.png")

    #Yearly segmentation
    #The last two digits of each date give the year; compare numerically so
    #that years such as 2009 ('09') are matched as well
    year_suffix = pd.to_numeric(
        dfFilter['Date'].str.extract(r'.*(\d{2})', expand = False),
        errors='coerce')
    message_years = 2000 + year_suffix
    last_year = int(message_years.iloc[-1])
    years = list(range(anio, last_year + 1))
    #Messages per sender for every year; absent senders count as 0
    vals2 = {}
    for year in years:
        cont = Counter(dfFilter.loc[message_years == year, 'Sender'])
        vals2[year] = {sender: cont[sender] for sender in Senders}

    #Plot the evolution
    fig = plt.figure()
    for sender in Senders:
        messages = [vals2[year][sender] for year in years]
        plt.plot(years,messages, label=sender, color = colour_dict[sender])
    fig.suptitle('Evolution of Number of Messages Sent per Year', fontsize=12)
    plt.xticks(range(anio,last_year+1))
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Messages', fontsize=12)
    plt.legend(loc="best")
    _show_figure(fig, savefigs, directory, "Evolution Messages.png")

    #Global Word Cloud
    _plot_word_cloud(_join_clean_tokens(dfFilter['Message']),
                     "Global Word Cloud since " + mes + " of " + str(anio),
                     savefigs, directory, "WordCloud Global.png",
                     normalize_plurals=False)

    #One wordcloud per person
    for sender in Senders:
        sender_messages = dfFilter.loc[dfFilter['Sender'] == sender, 'Message']
        _plot_word_cloud(_join_clean_tokens(sender_messages),
                         f"Word cloud for {sender} from {mes} of {anio}",
                         savefigs, directory, "WordCloud for" + sender + ".png")

    #Conduct sentiment analysis
    # Initialize the VADER Sentiment Intensity Analyzer
    sid = SentimentIntensityAnalyzer()

    def get_sentiment_polarity(message):
        """Classify *message* from its VADER compound score."""
        compound = sid.polarity_scores(message)['compound']
        if compound >= 0.05:
            return 'positive'
        if compound <= -0.05:
            return 'negative'
        return 'neutral'

    # Apply the sentiment analysis to the 'Message' column
    dfFilter['Sentiment'] = dfFilter['Message'].apply(get_sentiment_polarity)
    # Calculate the percentage of each sentiment for all messages
    sentiment_percentage_all = dfFilter['Sentiment'].value_counts(normalize=True) * 100
    # Calculate the percentage of each sentiment per sender
    sentiment_percentage_per_sender = dfFilter.groupby(['Sender', 'Sentiment']).size().unstack(fill_value=0)
    sentiment_percentage_per_sender = sentiment_percentage_per_sender.div(sentiment_percentage_per_sender.sum(axis=1), axis=0) * 100
    # Display the results
    print("Sentiment Percentage for All Messages:")
    print(sentiment_percentage_all)
    print("\nSentiment Percentage per Sender:")
    print(sentiment_percentage_per_sender)

    # Calculate the percentage of each sentiment per year per sender
    dfFilter['Year'] = pd.to_datetime(dfFilter['Date']).dt.year
    sentiment_percentage_per_year_per_sender = dfFilter.groupby(['Year', 'Sender', 'Sentiment']).size().unstack(fill_value=0)
    # Guarantee all three sentiment columns exist even if one never occurs,
    # otherwise the line plots below fail on the missing column
    sentiment_percentage_per_year_per_sender = sentiment_percentage_per_year_per_sender.reindex(
        columns=pd.Index(_SENTIMENTS, name='Sentiment'), fill_value=0)
    sentiment_percentage_per_year_per_sender = sentiment_percentage_per_year_per_sender.div(
        sentiment_percentage_per_year_per_sender.sum(axis=1), axis=0) * 100

    #Define senders to do more specific analysis
    senders = dfFilter['Sender'].unique()

    # Stacked bars: sentiment share per year, one chart per sender
    for sender in senders:
        data = sentiment_percentage_per_year_per_sender.loc[(slice(None), sender), :]
        data.unstack().plot(kind='bar', stacked=True, title=f'Sentiment Percentage per Year for {sender}')
        plt.ylabel('Percentage')
        plt.xlabel('Year')
        plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
        _show_figure(plt.gcf(), savefigs, directory,
                     f"Sentiment Analysis for {sender}.png")

    # Lines: sentiment share per year, one chart per sender
    year_ticks = sorted(dfFilter['Year'].unique())
    for sender in senders:
        data = sentiment_percentage_per_year_per_sender.loc[(slice(None), sender), :].unstack()
        fig = plt.figure(figsize=(10, 6))
        for sentiment in ['positive', 'neutral', 'negative']:
            plt.plot(data[sentiment], label=sentiment.capitalize())
        plt.title(f'Sentiment Percentage per Year for {sender}')
        plt.xlabel('Year')
        plt.ylabel('Percentage')
        plt.xticks(year_ticks, rotation=45)  # Set x-axis ticks to unique years
        plt.legend(title='Sentiment',loc="best")
        plt.grid(True)
        _show_figure(fig, savefigs, directory,
                     f"Sentiment Percentage Line Evolution for {sender}.png")

    # Hour of day of every message; format='mixed' lets pandas infer the
    # time format per element
    dfFilter['Activation_Time'] = pd.to_datetime(dfFilter['Time'], format='mixed').dt.hour

    # Plot a histogram of message activation times
    fig = plt.figure(figsize=(10, 6))
    plt.hist(dfFilter['Activation_Time'], bins=_HOUR_BINS, edgecolor='black', alpha=0.7)
    plt.title('Distribution of Message Activation Times')
    plt.xlabel('Activation Time (Hour of Day)')
    plt.ylabel('Number of Messages')
    plt.xticks(range(24))
    plt.grid(True)
    _show_figure(fig, savefigs, directory, "Activation Times.png")

    # Plot a histogram of message activation times, one series per sender
    fig = plt.figure(figsize=(10, 6))
    for sender in senders:
        plt.hist(dfFilter[dfFilter['Sender'] == sender]['Activation_Time'],
                 bins=_HOUR_BINS, edgecolor='black',
                 color = colour_dict[sender], alpha=0.7,label = sender)
    plt.title('Distribution of Message Activation Times')
    plt.xlabel('Activation Time (Hour of Day)')
    plt.ylabel('Number of Messages')
    plt.xticks(range(24))
    plt.legend(loc='upper left')
    plt.grid(True)
    _show_figure(fig, savefigs, directory, "Activation Times by Senders.png")