# NOTE(review): the three lines below were scraped-page residue
# ("Spaces:" / "Sleeping" / "Sleeping" — a Hugging Face Space status
# header, not part of the script); kept here as a comment so the file
# parses as Python.
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Wed Feb 28 18:10:36 2024 | |
| @author: STEFAN | |
| """ | |
| #Import all libraries | |
| import pandas as pd | |
| import re | |
| from wordcloud import WordCloud | |
| from collections import Counter | |
| import matplotlib.pyplot as plt | |
| import nltk | |
| nltk.download('stopwords') | |
| nltk.download('punkt') | |
| nltk.download('vader_lexicon') | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.sentiment.vader import SentimentIntensityAnalyzer | |
| #import nltk | |
| #from nltk.stem import PorterStemmer | |
| #from gensim import corpora, models | |
| #import numpy as np | |
| import matplotlib.colors as mcolors | |
| import random | |
| import calendar | |
| import string | |
| import streamlit as st | |
| def WhatsappAnalysis(df,nombregrupo,savefigs=False,directory=''): | |
| #file = "WhatsApp Chat with Test Group.txt" | |
| #If you do not want to save figures, just comment the directory line and change savefigs to False | |
| #directory = "/Users/selbl/Downloads//" | |
| #savefigs = False | |
| #Get stop words for filtering | |
| #stopwords = nltk.corpus.stopwords.words('english') | |
| stop_words = set(stopwords.words('english')) | |
| #stop_words = nltk.corpus.stopwords.words('english') | |
| #Get special characters for filtering | |
| #Create dictionary with the items to be deleted | |
| del_dict = {sp_character: '' for sp_character in string.punctuation} | |
| #Add the space character | |
| del_dict[' '] = '' | |
| #Create reference table | |
| table = str.maketrans(del_dict) | |
| #Create filtered database | |
| dfFilter = df | |
| #Obtain length of the filtered database (important for later) | |
| largo = len(dfFilter) | |
| #Iterate over sample to eliminate stop words | |
| for i in range(largo): | |
| #Extract message | |
| message = df.at[i,"Message"] | |
| #Tokenize | |
| word_tokens = word_tokenize(message) | |
| #Filter stop words | |
| filtered_sentence = [w for w in word_tokens if not w in stop_words] | |
| #This filter still returns certain punctation and special characters which we do not care about | |
| #Hence why I get rid of them | |
| filtered_sentence = [val.translate(table) for val in filtered_sentence] | |
| #Create coherent message | |
| fd = "" | |
| for x in filtered_sentence: | |
| fd = fd + x + " " | |
| #Remove final space | |
| fd = fd[:-1] | |
| #Add in database | |
| dfFilter.at[i,"Message"] = fd | |
| dfFilter['Sender'] = dfFilter['Sender'].str.split().str[0] | |
| Senders = list(set([sender for sender in dfFilter['Sender']])) | |
| colours = list(mcolors.CSS4_COLORS.keys()) | |
| transparent_colours = ['lightgray','lightgrey','whitesmoke','white','snow','mistyrose','lightcoral','seashell', | |
| 'peachpuff','linen','bisque','antiquewhite','papayawhip','bisque','lemonchiffon','beige','blanchealmond','oldlace', | |
| 'floralwhite','cornsilk','ivory','lightyellow','lightgoldenrodyellow','honeydew', | |
| 'mintcream','azure','lightcyan','lightskyblue','aliceblue','lightsteelblue','ghostwhite', | |
| 'lavender','lavenderblush','lightpink'] | |
| #eliminate them from consideration | |
| colours = [x for x in colours if x not in transparent_colours] | |
| def random_color_generator(colors): | |
| color = random.choice(colors) | |
| return color | |
| #Create the color scheme | |
| #Get number of senders | |
| num_senders = len(Senders) | |
| #Initialize the empty list that stores the colours | |
| colorscheme = random.sample(colours, num_senders) | |
| #Store colorscheme as a dictionary as well. This is to make plotting easier | |
| colour_dict = {} | |
| conti = 0 | |
| for sender in Senders: | |
| colour_dict[sender] = colorscheme[conti] | |
| conti = conti + 1 | |
| #Obtain the first year by looking at the first message, same with first month | |
| splitline_date = re.split('/',dfFilter.iloc[0]['Date']) | |
| anio = 2000 + int(splitline_date[2]) | |
| mes = calendar.month_name[int(splitline_date[0])] | |
| #Get a graph of different messages by sender | |
| #Get the number of messages | |
| NumMessages = [sum(dfFilter['Sender'] == sender) for sender in Senders] | |
| #Generate the plot | |
| fig = plt.figure() | |
| ax = fig.add_axes([0,0,1,1]) | |
| ax.set_ylabel('Messages') | |
| ax.set_xlabel('Sender') | |
| ax.set_title(f'Number of Messages in {nombregrupo} by sender since {mes} of {anio}') | |
| ax.bar(Senders,NumMessages,color=colorscheme) | |
| if savefigs: | |
| plt.savefig(directory[:-1] + "Global Messages.png") | |
| st.pyplot(fig) | |
| #Yearly segmentation | |
| #Obtain the evolution of messages per year | |
| #Define years | |
| #obtain last year from og year | |
| c = dfFilter['Date'].str.extract('.*(\d{2})', expand = False) | |
| last_year = 2000 + int(c.iloc[-1]) | |
| #Obtain range of years | |
| diff_year = last_year - anio + 1 | |
| #Obtain distinct last digit of years | |
| years = [anio + x for x in range(0,diff_year)] | |
| #Define dictionary | |
| vals2 = {} | |
| #Obtain counts | |
| for year in years: | |
| vals2[year] = {} | |
| #subset dataframealse) | |
| cont = Counter(dfFilter[dfFilter['Date'].str.extract('.*(\d{2})', expand = False) == str(year - 2000)]['Sender']).most_common(len(Senders)) | |
| #Check if somebody is missing | |
| #Check if length is different | |
| if len(cont) < len(Senders): | |
| #If so, find which ones are not in the count | |
| PresentSenders = [cont[i][0] for i in range(len(cont))] | |
| for sender in Senders: | |
| if sender not in PresentSenders: | |
| cont.append((sender,0)) | |
| #Obtain return list | |
| for i in range(len(cont)): | |
| #Get sender | |
| send = cont[i][0] | |
| #Get number of messages | |
| nro = cont[i][1] | |
| #Add to dictionary | |
| vals2[year][send] = nro | |
| #Plot the evolution | |
| fig = plt.figure() | |
| for sender in Senders: | |
| #Get messages: | |
| messages = [vals2[year][sender] for year in years] | |
| plt.plot(years,messages, label=sender, color = colour_dict[sender]) | |
| fig.suptitle('Evolution of Number of Messages Sent per Year', fontsize=12) | |
| plt.xticks(range(anio,last_year+1)) | |
| plt.xlabel('Year', fontsize=12) | |
| plt.ylabel('Messages', fontsize=12) | |
| plt.legend(loc="best") | |
| if savefigs: | |
| plt.savefig(directory[:-1]+ "Evolution Messages.png") | |
| st.pyplot(fig) | |
| #Global Word Cloud | |
| all_words = '' | |
| #looping through all incidents and joining them to one text, to extract most common words | |
| for arg in dfFilter['Message']: | |
| tokens = arg.split() | |
| #Remove media and omitted if they exist | |
| if ('Media' in tokens): | |
| tokens.remove('Media') | |
| if ('omitted' in tokens): | |
| tokens.remove('omitted') | |
| ''' | |
| The tokens below appeared a lot during testing | |
| My guess is because they appeared when tokenizing a plural word or because they represent Whatsapp data that cannot be | |
| represented into text (such as a sticker). | |
| In any case, the code works well with this | |
| ''' | |
| if ('s' in tokens): | |
| tokens.remove('s') | |
| if ('m' in tokens): | |
| tokens.remove('m') | |
| #Join | |
| all_words += " ".join(tokens)+" " | |
| wordcloud = WordCloud(width = 700, height = 700, | |
| background_color ='white', | |
| min_font_size = 10,normalize_plurals=False).generate(all_words) | |
| # plot the WordCloud image | |
| plt.figure(figsize = (5, 5), facecolor = None) | |
| plt.title("Global Word Cloud since " + mes + " of " + str(anio)) | |
| plt.imshow(wordcloud) | |
| plt.axis("off") | |
| plt.tight_layout(pad = 0) | |
| if savefigs: | |
| plt.savefig(directory[:-1] + "WordCloud Global.png") | |
| st.pyplot(plt.gcf()) | |
| #One worldcloud per person | |
| #Obtain senders | |
| for sender in Senders: | |
| all_words = '' | |
| #Create subdf with only their messages | |
| dfFilter2 = dfFilter[dfFilter['Sender'] == sender] | |
| #looping through all incidences and joining them to one text, to extract most common words | |
| for arg in dfFilter2['Message']: | |
| tokens = arg.split() | |
| #Remove media and omitted if they exist | |
| if ('Media' in tokens): | |
| tokens.remove('Media') | |
| if ('omitted' in tokens): | |
| tokens.remove('omitted') | |
| #Same comment as in the cell above | |
| if ('s' in tokens): | |
| tokens.remove('s') | |
| if ('m' in tokens): | |
| tokens.remove('m') | |
| all_words += " ".join(tokens)+" " | |
| wordcloud = WordCloud(width = 700, height = 700, | |
| background_color ='white', | |
| min_font_size = 10).generate(all_words) | |
| plt.figure(figsize = (5, 5), facecolor = None) | |
| plt.imshow(wordcloud) | |
| plt.title(f"Word cloud for {sender} from {mes} of {anio}") | |
| plt.axis("off") | |
| plt.tight_layout(pad = 0) | |
| if savefigs: | |
| plt.savefig(directory[:-1] + "WordCloud for" + sender + ".png") | |
| st.pyplot(plt.gcf()) | |
| #Conduct sentiment analysis | |
| # Initialize the VADER Sentiment Intensity Analyzer | |
| sid = SentimentIntensityAnalyzer() | |
| # Function to get sentiment polarity | |
| def get_sentiment_polarity(message): | |
| sentiment_scores = sid.polarity_scores(message) | |
| return 'positive' if sentiment_scores['compound'] >= 0.05 else 'negative' if sentiment_scores['compound'] <= -0.05 else 'neutral' | |
| # Apply the sentiment analysis to the 'Message' column | |
| dfFilter['Sentiment'] = dfFilter['Message'].apply(get_sentiment_polarity) | |
| # Calculate the percentage of each sentiment for all messages | |
| sentiment_percentage_all = dfFilter['Sentiment'].value_counts(normalize=True) * 100 | |
| # Calculate the percentage of each sentiment per sender | |
| sentiment_percentage_per_sender = dfFilter.groupby(['Sender', 'Sentiment']).size().unstack(fill_value=0) | |
| sentiment_percentage_per_sender = sentiment_percentage_per_sender.div(sentiment_percentage_per_sender.sum(axis=1), axis=0) * 100 | |
| # Display the results | |
| print("Sentiment Percentage for All Messages:") | |
| print(sentiment_percentage_all) | |
| print("\nSentiment Percentage per Sender:") | |
| print(sentiment_percentage_per_sender) | |
| # Calculate the percentage of each sentiment per year per sender | |
| #Create year column | |
| dfFilter['Year'] = pd.to_datetime(dfFilter['Date']).dt.year | |
| #Get it by year | |
| sentiment_percentage_per_year_per_sender = dfFilter.groupby(['Year', 'Sender', 'Sentiment']).size().unstack(fill_value=0) | |
| sentiment_percentage_per_year_per_sender = sentiment_percentage_per_year_per_sender.div(sentiment_percentage_per_year_per_sender.sum(axis=1), axis=0) * 100 | |
| # Plot the results using Matplotlib | |
| for sender in dfFilter['Sender'].unique(): | |
| data = sentiment_percentage_per_year_per_sender.loc[(slice(None), sender), :] | |
| ax = data.unstack().plot(kind='bar', stacked=True, title=f'Sentiment Percentage per Year for {sender}') | |
| plt.ylabel('Percentage') | |
| plt.xlabel('Year') | |
| plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left') | |
| if savefigs: | |
| plt.savefig(directory[:-1] + f"Sentiment Analysis for {sender}.png") | |
| st.pyplot(plt.gcf()) | |
| #Define senders to do more specific analysis | |
| senders = dfFilter['Sender'].unique() | |
| for sender in senders: | |
| data = sentiment_percentage_per_year_per_sender.loc[(slice(None), sender), :].unstack() | |
| plt.figure(figsize=(10, 6)) | |
| for sentiment in ['positive', 'neutral', 'negative']: | |
| plt.plot(data[sentiment], label=sentiment.capitalize()) | |
| plt.title(f'Sentiment Percentage per Year for {sender}') | |
| plt.xlabel('Year') | |
| plt.ylabel('Percentage') | |
| plt.xticks(list(set(dfFilter['Year'])), rotation=45) # Set x-axis ticks to unique years | |
| plt.legend(title='Sentiment',loc="best") | |
| plt.grid(True) | |
| if savefigs: | |
| plt.savefig(directory[:-1] + f"Sentiment Percentage Line Evolution for {sender}.png") | |
| st.pyplot(plt.gcf()) | |
| # Assuming 'Time' is in the format 'HH:MM:SS'. If not, adjust the format accordingly. | |
| #dfFilter['Activation_Time'] = pd.to_datetime(dfFilter['Time'], format='%H:%M').dt.hour | |
| dfFilter['Activation_Time'] = pd.to_datetime(dfFilter['Time'], format='mixed').dt.hour | |
| # Plot a histogram of message activation times | |
| plt.figure(figsize=(10, 6)) | |
| plt.hist(dfFilter['Activation_Time'], bins=24, edgecolor='black', alpha=0.7) | |
| plt.title('Distribution of Message Activation Times') | |
| plt.xlabel('Activation Time (Hour of Day)') | |
| plt.ylabel('Number of Messages') | |
| plt.xticks(range(24)) | |
| plt.grid(True) | |
| if savefigs: | |
| plt.savefig(directory[:-1] + "Activation Times.png") | |
| st.pyplot(plt.gcf()) | |
| # Plot a histogram of message activation times | |
| plt.figure(figsize=(10, 6)) | |
| for sender in senders: | |
| plt.hist(dfFilter[dfFilter['Sender'] == sender]['Activation_Time'], bins=24, edgecolor='black',color = colour_dict[sender], alpha=0.7,label = sender) | |
| plt.title('Distribution of Message Activation Times') | |
| plt.xlabel('Activation Time (Hour of Day)') | |
| plt.ylabel('Number of Messages') | |
| plt.xticks(range(24)) | |
| plt.legend(loc='upper left') | |
| plt.grid(True) | |
| if savefigs: | |
| plt.savefig(directory[:-1] + "Activation Times by Senders.png") | |
| st.pyplot(plt.gcf()) | |