Spaces:

selbl
/

WhatsappAnalysis

Sleeping

App Files Files Community

WhatsappAnalysis / script.py

selbl

Added support for AM and PM time formats

5c3185b verified 8 months ago

raw

history blame contribute delete

13.7 kB

	# -*- coding: utf-8 -*-
	"""
	Created on Wed Feb 28 18:10:36 2024

	@author: STEFAN
	"""

	#Import all libraries
	import pandas as pd
	import re
	from wordcloud import WordCloud
	from collections import Counter
	import matplotlib.pyplot as plt
	import nltk
	nltk.download('stopwords')
	nltk.download('punkt')
	nltk.download('vader_lexicon')
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.sentiment.vader import SentimentIntensityAnalyzer
	#import nltk
	#from nltk.stem import PorterStemmer
	#from gensim import corpora, models
	#import numpy as np
	import matplotlib.colors as mcolors
	import random
	import calendar
	import string
	import streamlit as st

	def WhatsappAnalysis(df,nombregrupo,savefigs=False,directory=''):
	    """Render a Streamlit report of charts describing a WhatsApp chat.

	    The report contains, in order: a bar chart of messages per sender, a
	    line chart of messages per sender per year, a global word cloud, one
	    word cloud per sender, a stacked-bar and a line chart of yearly
	    sentiment for every sender, and two histograms of the hour at which
	    messages were sent. Every figure is handed to ``st.pyplot`` and then
	    closed. Overall and per-sender sentiment percentages are also printed
	    to standard output.

	    Parameters
	    ----------
	    df : pandas.DataFrame
	        One row per message with the string columns ``"Message"``,
	        ``"Sender"``, ``"Date"`` and ``"Time"``. ``"Date"`` is split on
	        ``/``; the first field is read as the month number and the third
	        as the year, so month/day/year order is assumed. The frame is
	        not modified: all work happens on a copy.
	    nombregrupo : str
	        Group name, used in the title of the first chart.
	    savefigs : bool, optional
	        When true, each figure is also written to disk as a PNG.
	    directory : str, optional
	        Prefix for saved files. The last character is dropped before the
	        file name is appended (``directory[:-1] + name``), so a directory
	        has to be passed with a doubled trailing separator, e.g.
	        ``"/some/dir//"``.

	    Returns
	    -------
	    None
	    """
	    if len(df) == 0:
	        # Every chart title needs the date of the first message.
	        st.warning("The chat contains no messages to analyse.")
	        return

	    # Work on a private copy with a 0..n-1 index so the caller's frame is
	    # left untouched.
	    dfFilter = df.copy().reset_index(drop=True)

	    # Strip stop words, punctuation and spaces from every message.
	    # NOTE: the stop-word match is case-sensitive ("The" is kept).
	    stop_words = set(stopwords.words('english'))
	    strip_table = str.maketrans('', '', string.punctuation + ' ')
	    dfFilter['Message'] = [
	        _clean_message(message, stop_words, strip_table)
	        for message in dfFilter['Message']
	    ]

	    # Identify each sender by the first word of their name.
	    dfFilter['Sender'] = dfFilter['Sender'].str.split().str[0]
	    # unique() keeps first-appearance order, so chart order is stable
	    # from run to run (a set would not be).
	    Senders = list(dfFilter['Sender'].dropna().unique())

	    # Named colours that are too pale to read on a white background.
	    pale_colours = {
	        'lightgray', 'lightgrey', 'whitesmoke', 'white', 'snow', 'mistyrose',
	        'lightcoral', 'seashell', 'peachpuff', 'linen', 'bisque',
	        'antiquewhite', 'papayawhip', 'lemonchiffon', 'beige',
	        'blanchedalmond', 'oldlace', 'floralwhite', 'cornsilk', 'ivory',
	        'lightyellow', 'lightgoldenrodyellow', 'honeydew', 'mintcream',
	        'azure', 'lightcyan', 'lightskyblue', 'aliceblue', 'lightsteelblue',
	        'ghostwhite', 'lavender', 'lavenderblush', 'lightpink',
	    }
	    colours = [name for name in mcolors.CSS4_COLORS if name not in pale_colours]

	    # One random colour per sender, reused by every per-sender plot.
	    num_senders = len(Senders)
	    if num_senders <= len(colours):
	        colorscheme = random.sample(colours, num_senders)
	    else:
	        # More senders than distinct colours: allow repeats instead of
	        # letting random.sample raise.
	        colorscheme = random.choices(colours, k=num_senders)
	    colour_dict = dict(zip(Senders, colorscheme))

	    # Starting month and year come from the first message.
	    first_date = re.split('/', dfFilter.iloc[0]['Date'])
	    anio = _full_year(first_date[2])
	    mes = calendar.month_name[int(first_date[0])]

	    # ---- Messages per sender ------------------------------------------
	    NumMessages = [int((dfFilter['Sender'] == sender).sum()) for sender in Senders]
	    fig = plt.figure()
	    ax = fig.add_axes([0,0,1,1])
	    ax.set_ylabel('Messages')
	    ax.set_xlabel('Sender')
	    ax.set_title(f'Number of Messages in {nombregrupo} by sender since {mes} of {anio}')
	    ax.bar(Senders,NumMessages,color=colorscheme)
	    _emit_figure(fig, "Global Messages.png", savefigs, directory)

	    # ---- Messages per sender per year ---------------------------------
	    # The year of each message is taken from the last two digits of its
	    # date. Comparing numbers (not strings) keeps "09" equal to 2009.
	    two_digit_year = dfFilter['Date'].str.extract(r'.*(\d{2})', expand = False)
	    message_years = 2000 + pd.to_numeric(two_digit_year, errors='coerce')
	    last_year = int(message_years.iloc[-1])
	    years = list(range(anio, last_year + 1))

	    # vals2[year][sender] -> number of messages; absent senders count 0.
	    vals2 = {}
	    for year in years:
	        counts = Counter(dfFilter.loc[message_years == year, 'Sender'])
	        vals2[year] = {sender: counts.get(sender, 0) for sender in Senders}

	    fig = plt.figure()
	    for sender in Senders:
	        messages = [vals2[year][sender] for year in years]
	        plt.plot(years,messages, label=sender, color = colour_dict[sender])
	    fig.suptitle('Evolution of Number of Messages Sent per Year', fontsize=12)
	    plt.xticks(range(anio,last_year+1))
	    plt.xlabel('Year', fontsize=12)
	    plt.ylabel('Messages', fontsize=12)
	    plt.legend(loc="best")
	    _emit_figure(fig, "Evolution Messages.png", savefigs, directory)

	    # ---- Word clouds --------------------------------------------------
	    _render_word_cloud(
	        _cloud_text(dfFilter['Message']),
	        "Global Word Cloud since " + mes + " of " + str(anio),
	        "WordCloud Global.png",
	        savefigs,
	        directory,
	        normalize_plurals=False,
	    )
	    for sender in Senders:
	        sender_messages = dfFilter.loc[dfFilter['Sender'] == sender, 'Message']
	        _render_word_cloud(
	            _cloud_text(sender_messages),
	            f"Word cloud for {sender} from {mes} of {anio}",
	            "WordCloud for" + sender + ".png",
	            savefigs,
	            directory,
	        )

	    # ---- Sentiment analysis (VADER) -----------------------------------
	    sid = SentimentIntensityAnalyzer()

	    def get_sentiment_polarity(message):
	        """Label a message from its VADER compound score (cut-offs +/-0.05)."""
	        compound = sid.polarity_scores(message)['compound']
	        if compound >= 0.05:
	            return 'positive'
	        if compound <= -0.05:
	            return 'negative'
	        return 'neutral'

	    dfFilter['Sentiment'] = dfFilter['Message'].apply(get_sentiment_polarity)

	    # Share of each sentiment over all messages and per sender.
	    sentiment_percentage_all = dfFilter['Sentiment'].value_counts(normalize=True) * 100
	    sentiment_percentage_per_sender = dfFilter.groupby(['Sender', 'Sentiment']).size().unstack(fill_value=0)
	    sentiment_percentage_per_sender = sentiment_percentage_per_sender.div(sentiment_percentage_per_sender.sum(axis=1), axis=0) * 100

	    print("Sentiment Percentage for All Messages:")
	    print(sentiment_percentage_all)
	    print("\nSentiment Percentage per Sender:")
	    print(sentiment_percentage_per_sender)

	    # Share of each sentiment per (year, sender).
	    dfFilter['Year'] = pd.to_datetime(dfFilter['Date']).dt.year
	    sentiment_percentage_per_year_per_sender = dfFilter.groupby(['Year', 'Sender', 'Sentiment']).size().unstack(fill_value=0)
	    # Guarantee all three columns exist even when a class never occurs;
	    # otherwise the line plot below fails with a KeyError.
	    sentiment_percentage_per_year_per_sender = sentiment_percentage_per_year_per_sender.reindex(
	        columns=['negative', 'neutral', 'positive'], fill_value=0)
	    sentiment_percentage_per_year_per_sender = sentiment_percentage_per_year_per_sender.div(sentiment_percentage_per_year_per_sender.sum(axis=1), axis=0) * 100

	    # Stacked bars: one chart per sender, one bar per year.
	    for sender in Senders:
	        # xs() leaves a Year-indexed frame with plain sentiment columns,
	        # so the legend shows sentiment names rather than tuples.
	        data = sentiment_percentage_per_year_per_sender.xs(sender, level='Sender')
	        ax = data.plot(kind='bar', stacked=True, title=f'Sentiment Percentage per Year for {sender}')
	        ax.set_ylabel('Percentage')
	        ax.set_xlabel('Year')
	        ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
	        _emit_figure(ax.get_figure(), f"Sentiment Analysis for {sender}.png", savefigs, directory)

	    # Lines: one chart per sender, one line per sentiment.
	    year_ticks = sorted(set(dfFilter['Year']))
	    for sender in Senders:
	        data = sentiment_percentage_per_year_per_sender.xs(sender, level='Sender')
	        fig = plt.figure(figsize=(10, 6))
	        for sentiment in ['positive', 'neutral', 'negative']:
	            plt.plot(data.index, data[sentiment], label=sentiment.capitalize())
	        plt.title(f'Sentiment Percentage per Year for {sender}')
	        plt.xlabel('Year')
	        plt.ylabel('Percentage')
	        plt.xticks(year_ticks, rotation=45) # Set x-axis ticks to unique years
	        plt.legend(title='Sentiment',loc="best")
	        plt.grid(True)
	        _emit_figure(fig, f"Sentiment Percentage Line Evolution for {sender}.png", savefigs, directory)

	    # ---- Hour-of-day activity -----------------------------------------
	    # format='mixed' lets pandas parse each time string on its own, which
	    # covers both 24-hour and AM/PM values.
	    dfFilter['Activation_Time'] = pd.to_datetime(dfFilter['Time'], format='mixed').dt.hour
	    # One bin per hour, [h, h+1), so bars line up with the 0..23 ticks
	    # regardless of which hours appear in the data.
	    hour_bins = list(range(25))

	    fig = plt.figure(figsize=(10, 6))
	    plt.hist(dfFilter['Activation_Time'], bins=hour_bins, edgecolor='black', alpha=0.7)
	    plt.title('Distribution of Message Activation Times')
	    plt.xlabel('Activation Time (Hour of Day)')
	    plt.ylabel('Number of Messages')
	    plt.xticks(range(24))
	    plt.grid(True)
	    _emit_figure(fig, "Activation Times.png", savefigs, directory)

	    # Same histogram, overlaid per sender in that sender's colour.
	    fig = plt.figure(figsize=(10, 6))
	    for sender in Senders:
	        plt.hist(dfFilter[dfFilter['Sender'] == sender]['Activation_Time'], bins=hour_bins, edgecolor='black',color = colour_dict[sender], alpha=0.7,label = sender)
	    plt.title('Distribution of Message Activation Times')
	    plt.xlabel('Activation Time (Hour of Day)')
	    plt.ylabel('Number of Messages')
	    plt.xticks(range(24))
	    plt.legend(loc='upper left')
	    plt.grid(True)
	    _emit_figure(fig, "Activation Times by Senders.png", savefigs, directory)


	def _full_year(text):
	    """Turn a year field into a four-digit year; two-digit values mean 20xx."""
	    year = int(text)
	    return year + 2000 if year < 100 else year


	def _clean_message(message, stop_words, strip_table):
	    """Tokenize a message, drop stop words, strip punctuation, and re-join."""
	    stripped = (
	        token.translate(strip_table)
	        for token in word_tokenize(message)
	        if token not in stop_words
	    )
	    # Tokens that were pure punctuation are empty after stripping.
	    return " ".join(token for token in stripped if token)


	def _cloud_text(messages):
	    """Join messages into one text, leaving out tokens that are export noise."""
	    # 'Media'/'omitted' come from the "<Media omitted>" placeholder; 's' and
	    # 'm' are what remains of tokenized contractions such as "'s" and "'m".
	    noise = {'Media', 'omitted', 's', 'm'}
	    return " ".join(
	        token
	        for message in messages
	        for token in message.split()
	        if token not in noise
	    )


	def _render_word_cloud(text, title, filename, savefigs, directory, **cloud_options):
	    """Draw a 700x700 word cloud of the text as its own titled figure."""
	    try:
	        cloud = WordCloud(width = 700, height = 700,
	                          background_color ='white',
	                          min_font_size = 10, **cloud_options).generate(text)
	    except ValueError:
	        # WordCloud raises ValueError when the text yields no usable word.
	        st.info(f"Not enough words to draw: {title}")
	        return
	    fig = plt.figure(figsize = (5, 5), facecolor = None)
	    plt.imshow(cloud)
	    plt.title(title)
	    plt.axis("off")
	    plt.tight_layout(pad = 0)
	    _emit_figure(fig, filename, savefigs, directory)


	def _emit_figure(fig, filename, savefigs, directory):
	    """Optionally save a figure, show it in Streamlit, then release it."""
	    if savefigs:
	        # Historical convention: the last character of directory is dropped.
	        fig.savefig(directory[:-1] + filename)
	    st.pyplot(fig)
	    # Without this, figures pile up in pyplot's registry on every rerun.
	    plt.close(fig)