Spaces:
Build error
Build error
Irene G
commited on
Commit
·
ef0c1fa
1
Parent(s):
5398988
first
Browse files- app.py +158 -0
- preprocess.py +75 -0
- stats_graphs.py +136 -0
app.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
import numpy as np
import streamlit as st
import regex as re
import matplotlib.pyplot as plt

import preprocess as prep
import stats_graphs as sts


st.sidebar.title('Analiza tu chat de WhatsApp')

# uploading the exported chat file
uploaded_file = st.sidebar.file_uploader('Por favor, sube aquí el archivo .txt del chat')

if uploaded_file is not None:
    # extracting the text in bytes from the file and decoding it
    bytes_data = uploaded_file.getvalue()
    data = bytes_data.decode('utf-8')

    # preprocessing the text into a dataframe (one row per message)
    df = prep.preprocess(data)

    # fetch unique users
    user_list = df['User'].unique().tolist()

    # removing the group notifications from the users list and sorting it
    user_list.remove('Group Notification')
    user_list.sort()

    # 'General' at index 0 so the overall group analysis shows by default
    user_list.insert(0, 'General')

    selected_user = st.sidebar.selectbox(
        'Mostrar análisis para ', user_list)

    st.title('Análisis del chat de Whats App para ' + selected_user)
    if st.sidebar.button('Mostrar análisis'):

        # getting the stats of the selected user
        num_messages, num_words, media_omitted, links = sts.fetch_stats(
            selected_user, df)

        # four columns for the stats (messages, words, media and links).
        # BUG FIX: st.beta_columns was removed in Streamlit >= 1.0;
        # st.columns is the stable replacement.
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.header('Nº de mensajes')
            st.title(num_messages)

        with col2:
            st.header('Nº de palabras')
            st.title(num_words)

        with col3:
            st.header('Nº de archivos compartidos')
            st.title(media_omitted)

        with col4:
            st.header('Nº de enlaces compartidos')
            st.title(links)

        # activity of the users (only meaningful for the whole-group view)
        if selected_user == 'General':

            # two columns: a bar chart with the top 5 most active users and
            # a dataframe with each user's percentage of the total activity
            st.title('Actividad de los usuarios')
            activity_count, act_df = sts.fetch_activity_users(df)

            fig, ax = plt.subplots()
            col1, col2 = st.columns(2)
            with col1:
                ax.bar(activity_count.index, activity_count.values, color='green')
                plt.xticks(rotation='vertical')
                st.pyplot(fig)

            with col2:
                st.dataframe(act_df)

        # Word Cloud for selected user
        st.title('Nube de palabras')
        df_img = sts.create_wordcloud(selected_user, df)
        fig, ax = plt.subplots()
        ax.imshow(df_img)
        st.pyplot(fig)

        # most common words in the chat
        most_common_df = sts.get_common_words(selected_user, df)
        fig, ax = plt.subplots()
        ax.barh(most_common_df[0], most_common_df[1])
        plt.xticks(rotation='vertical')
        st.title('Palabras más utilizadas')
        st.pyplot(fig)

        # Emoji Analysis
        emoji_df = sts.get_emoji_stats(selected_user, df)

        st.title('Análisis de emojis')

        # BUG FIX: a chat without emojis yields an empty dataframe; both the
        # column rename and the percentage division crashed on it before
        if emoji_df.empty:
            st.write('No se han encontrado emojis en el chat')
        else:
            emoji_df.columns = ['Emoji', 'Total']

            col1, col2 = st.columns(2)

            # count
            with col1:
                st.dataframe(emoji_df)
            # percentage — vectorized; the original recomputed sum() per row
            with col2:
                total = emoji_df['Total'].sum()
                emoji_df['Porcentaje'] = (emoji_df['Total'] / total) * 100
                st.dataframe(emoji_df)

        # Monthly timeline
        st.title('Actividad por mes')
        time = sts.monthly_timeline(selected_user, df)
        fig, ax = plt.subplots()
        ax.plot(time['Time'], time['Message'], color='blue')
        plt.xticks(rotation='vertical')
        plt.tight_layout()
        st.pyplot(fig)

        # Activity maps: days and months
        st.title('Mapas de actividad')

        col1, col2 = st.columns(2)

        with col1:
            st.header('Días de mayor actividad')
            days = sts.weekly_activity(selected_user, df)
            fig, ax = plt.subplots()
            ax.bar(days.index, days.values, color='purple')
            plt.xticks(rotation='vertical')
            plt.tight_layout()
            st.pyplot(fig)

        with col2:
            st.header('Meses de mayor actividad')
            months = sts.monthly_activity(selected_user, df)
            fig, ax = plt.subplots()
            ax.bar(months.index, months.values, color='orange')
            plt.xticks(rotation='vertical')
            plt.tight_layout()
            st.pyplot(fig)
|
preprocess.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import regex as re
|
5 |
+
import seaborn as sn
|
6 |
+
|
7 |
+
|
8 |
+
# function to separate time and date
|
9 |
+
def get_time_date(string):
    """Turn a 'date, time - ' prefix into a single 'date time' string."""
    pieces = string.split(',')
    day = pieces[0]
    # keep only the clock part, dropping the trailing ' - ' separator
    clock = pieces[1].split('-')[0].strip()
    return f"{day} {clock}"
|
16 |
+
|
17 |
+
# removing '\n' from the 'Message' column
|
18 |
+
def get_string(text):
    """Return the text up to (and excluding) the first newline."""
    head, _, _ = text.partition('\n')
    return head
|
20 |
+
|
21 |
+
# final preprocessing function
|
22 |
+
def preprocess(data):
    """Turn a raw exported WhatsApp chat into a tidy dataframe.

    Parameters
    ----------
    data : str
        Full text of the export, lines shaped 'dd/mm/yy, hh:mm - User: msg'.

    Returns
    -------
    pd.DataFrame
        One row per message with columns Message, Date, User plus the split
        timestamp components (Only date, Year, Month_num, Month, Day,
        Day_name, Hour, Minute) the stats module relies on.
    """
    # 'date, time - ' prefix at the start of every line of the export
    # (raw string so the backslash escapes reach the regex engine intact)
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # separate dates from messages
    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)

    # put both in a dataframe
    df = pd.DataFrame({'user_messages': messages,
                       'message_date': dates})

    df['message_date'] = df['message_date'].apply(
        lambda text: get_time_date(text))
    # BUG FIX: the column must be named 'Date' — the code below and the rest
    # of the app read df['Date']; the original renamed it to lowercase 'date'
    # and then crashed with a KeyError on df[['Message', 'Date', 'User']].
    df.rename(columns={'message_date': 'Date'}, inplace=True)

    # separation of the username
    users = []
    messages = []

    for message in df['user_messages']:

        entry = re.split(r'([\w\W]+?):\s', message)  # extracting the username
        if entry[1:]:
            users.append(entry[1])
            messages.append(entry[2])
        else:
            # group notifications don't have a linked user
            users.append('Group Notification')
            messages.append(entry[0])

    df['User'] = users
    df['Message'] = messages

    # keep only the first line of each message (drops the trailing '\n')
    df['Message'] = df['Message'].apply(lambda text: get_string(text))
    df = df.drop(['user_messages'], axis=1)

    df = df[['Message', 'Date', 'User']]

    # splitting and type transformation of the 'Date' column — parse the
    # timestamp ONCE and derive every component from it (the original called
    # pd.to_datetime over the whole column for each of the eight fields)
    parsed = pd.to_datetime(df['Date'])
    df['Only date'] = parsed.dt.date
    df['Year'] = parsed.dt.year
    df['Month_num'] = parsed.dt.month
    df['Month'] = parsed.dt.month_name()
    df['Day'] = parsed.dt.day
    df['Day_name'] = parsed.dt.day_name()
    df['Hour'] = parsed.dt.hour
    df['Minute'] = parsed.dt.minute

    return df
|
stats_graphs.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from collections import Counter
|
3 |
+
from wordcloud import WordCloud
|
4 |
+
from urlextract import URLExtract
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
import nltk
|
7 |
+
nltk.download('stopwords')
|
8 |
+
import emoji
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
def fetch_stats(selected_user, df):
    """Return (message count, word count, shared media count, link count).

    'General' selects the whole group; any other value filters to that user.
    """
    # selecting a specific user
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    # number of messages
    num_messages = df.shape[0]

    # total number of words across every message
    word_total = sum(len(message.split()) for message in df['Message'])

    # rows that are '<Media omitted>' placeholders (shared media files)
    media_rows = df[df['Message'] == '<Media omitted>']

    # number of shared links
    extractor = URLExtract()
    url_list = []
    for message in df['Message']:
        url_list.extend(extractor.find_urls(message))

    return num_messages, word_total, media_rows.shape[0], len(url_list)
|
38 |
+
|
39 |
+
|
40 |
+
# activity by user
|
41 |
+
def fetch_activity_users(df):
    """Top-5 message counts per user plus each user's % of total activity."""
    # ignore automatic group notifications
    real_users = df[df['User'] != 'Group Notification']

    # top 5 most active users
    top_counts = real_users['User'].value_counts().head()

    # percentage of total activity per user
    share = (real_users['User'].value_counts() / real_users.shape[0]) * 100
    percent_df = pd.DataFrame(share)

    return top_counts, percent_df
|
49 |
+
|
50 |
+
# Word Cloud
|
51 |
+
def create_wordcloud(selected_user, df):
    """Build a WordCloud image from the selected user's messages.

    'General' means the whole group (no per-user filtering).
    """
    # BUG FIX: the app passes 'General' for the group view, not 'Overall';
    # with the old check the per-user filter never ran for any user.
    if selected_user != 'General':
        df = df[df['User'] == selected_user]
    # generate the cloud
    wc = WordCloud(width=500, height=500,
                   min_font_size=12, background_color='black')
    # cut and concatenate the words from the 'Message' column
    df_wc = wc.generate(df['Message'].str.cat(sep=" "))

    return df_wc
|
62 |
+
|
63 |
+
|
64 |
+
# get the 20 most common words
|
65 |
+
def get_common_words(selected_user, df):
    """Return a DataFrame with the 20 most common non-stopword words.

    'General' means the whole group (no per-user filtering).
    """
    # BUG FIX: use a distinct local name — assigning to ``stopwords`` made it
    # a local variable and shadowed the nltk import, so the original raised
    # UnboundLocalError on the very first call. A set also gives O(1) lookup.
    stop_words = set(stopwords.words('spanish'))

    # BUG FIX: the app passes 'General' for the group view, not 'Overall'
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    # BUG FIX: drop group notifications AND media placeholders — the original
    # OR'd the conditions (always true) and compared the media marker against
    # the 'User' column instead of 'Message'.
    timeline = df[(df['User'] != 'Group Notification') &
                  (df['Message'] != '<Media omitted>')]

    words = []

    for message in timeline['Message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    return pd.DataFrame(Counter(words).most_common(20))
|
88 |
+
|
89 |
+
# get the most used emojis
|
90 |
+
def get_emoji_stats(selected_user, df):
    """Count every emoji used, most common first.

    Returns a two-column DataFrame (emoji character, occurrence count);
    empty if no emojis appear. 'General' means the whole group.
    """
    # BUG FIX: the app passes 'General' for the group view, not 'Overall'
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    # emoji >= 2.0 removed UNICODE_EMOJI in favour of EMOJI_DATA;
    # support both so the app works with either version installed
    emoji_lookup = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_lookup is None:
        emoji_lookup = emoji.UNICODE_EMOJI['en']

    emojis = []

    for message in df['Message']:
        emojis.extend([c for c in message if c in emoji_lookup])

    counts = Counter(emojis)
    return pd.DataFrame(counts.most_common(len(counts)))
|
103 |
+
|
104 |
+
# user activity per month
|
105 |
+
def monthly_timeline(selected_user, df):
    """Messages per (year, month) with a 'Time' label like 'January-2021'.

    'General' means the whole group (no per-user filtering).
    """
    # BUG FIX: the app passes 'General' for the group view, not 'Overall';
    # the old check filtered the frame down to zero rows for that view.
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    timeline = df.groupby(['Year', 'Month_num', 'Month']).count()[
        'Message'].reset_index()

    # build the 'Month-Year' label for the x axis (vectorized — the original
    # looped over rows by index)
    timeline['Time'] = timeline['Month'] + '-' + timeline['Year'].astype(str)

    return timeline
|
121 |
+
|
122 |
+
# activity per month
|
123 |
+
def monthly_activity(selected_user, df):
    """Message count per month name ('General' = whole group)."""
    # BUG FIX: the group view is named 'General', not 'Overall'; the old
    # check filtered the frame to zero rows for the default view.
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    return df['Month'].value_counts()
|
129 |
+
|
130 |
+
# activity per week
|
131 |
+
def weekly_activity(selected_user, df):
    """Message count per weekday name ('General' = whole group)."""
    # BUG FIX: the group view is named 'General', not 'Overall'; the old
    # check filtered the frame to zero rows for the default view.
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    return df['Day_name'].value_counts()
|