Irene G committed on
Commit
ef0c1fa
·
1 Parent(s): 5398988
Files changed (3) hide show
  1. app.py +158 -0
  2. preprocess.py +75 -0
  3. stats_graphs.py +136 -0
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ import regex as re
5
+ import matplotlib.pyplot as plt
6
+
7
+ import preprocess as prep
8
+ import stats_graphs as sts
9
+
10
+
11
st.sidebar.title('Analiza tu chat de WhatsApp')

# --- chat file upload --------------------------------------------------------
uploaded_file = st.sidebar.file_uploader('Por favor, sube aquí el archivo .txt del chat')

if uploaded_file is not None:
    # raw bytes of the uploaded export; WhatsApp .txt exports are UTF-8
    bytes_data = uploaded_file.getvalue()
    data = bytes_data.decode('utf-8')

    # parse the export into a one-row-per-message dataframe
    df = prep.preprocess(data)

    # st.dataframe(df)  # debugging aid: show the parsed chat

    # unique users
    user_list = df['User'].unique().tolist()

    # drop the synthetic 'Group Notification' author and sort the rest.
    # ROBUSTNESS: guarded — a chat with no group notifications would
    # otherwise raise ValueError on .remove()
    if 'Group Notification' in user_list:
        user_list.remove('Group Notification')
    user_list.sort()

    # 'General' at position 0 so the whole-group analysis is the default
    user_list.insert(0, 'General')

    selected_user = st.sidebar.selectbox(
        'Mostrar análisis para ', user_list)

    st.title('Análisis del chat de Whats App para ' + selected_user)
    if st.sidebar.button('Mostrar análisis'):

        # headline stats of the selected user
        num_messages, num_words, media_omitted, links = sts.fetch_stats(
            selected_user, df)

        # four columns for the stats (messages, words, media and links).
        # BUG FIX: st.beta_columns was removed in Streamlit 1.0 — st.columns
        # is the stable API
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.header('Nº de mensajes')
            st.title(num_messages)

        with col2:
            st.header('Nº de palabras')
            st.title(num_words)

        with col3:
            st.header('Nº de archivos compartidos')
            st.title(media_omitted)

        with col4:
            st.header('Nº de enlaces compartidos')
            st.title(links)

        # activity of the users (whole-group view only)
        if selected_user == 'General':

            # two columns: bar chart with the top-5 most active users, and a
            # table with each user's percentage of total activity
            st.title('Actividad de los usuarios')
            activity_count, act_df = sts.fetch_activity_users(df)

            fig, ax = plt.subplots()
            col1, col2 = st.columns(2)
            with col1:
                ax.bar(activity_count.index, activity_count.values, color='green')
                plt.xticks(rotation='vertical')
                st.pyplot(fig)

            with col2:
                st.dataframe(act_df)

        # word cloud for the selected user
        st.title('Nube de palabras')
        df_img = sts.create_wordcloud(selected_user, df)
        fig, ax = plt.subplots()
        ax.imshow(df_img)
        st.pyplot(fig)

        # most common words in the chat
        most_common_df = sts.get_common_words(selected_user, df)
        fig, ax = plt.subplots()
        ax.barh(most_common_df[0], most_common_df[1])
        plt.xticks(rotation='vertical')
        st.title('Palabras más utilizadas')
        st.pyplot(fig)

        # emoji analysis
        emoji_df = sts.get_emoji_stats(selected_user, df)

        st.title('Análisis de emojis')

        # ROBUSTNESS: a chat with no emojis yields an empty frame; renaming
        # its (zero) columns or dividing by a zero total would crash
        if not emoji_df.empty:
            emoji_df.columns = ['Emoji', 'Total']

            col1, col2 = st.columns(2)

            # counts
            with col1:
                st.dataframe(emoji_df)
            # percentage of the total emoji count
            with col2:
                emoji_count = list(emoji_df['Total'])
                perlist = [(i / sum(emoji_count)) * 100 for i in emoji_count]
                emoji_df['Porcentaje'] = np.array(perlist)
                st.dataframe(emoji_df)

        # monthly timeline
        st.title('Actividad por mes')
        timeline = sts.monthly_timeline(selected_user, df)
        fig, ax = plt.subplots()
        ax.plot(timeline['Time'], timeline['Message'], color='blue')
        plt.xticks(rotation='vertical')
        plt.tight_layout()
        st.pyplot(fig)

        # activity maps: busiest days and months
        st.title('Mapas de actividad')

        col1, col2 = st.columns(2)

        with col1:
            st.header('Días de mayor actividad')

            days = sts.weekly_activity(selected_user, df)

            fig, ax = plt.subplots()
            ax.bar(days.index, days.values, color='purple')
            plt.xticks(rotation='vertical')
            plt.tight_layout()
            st.pyplot(fig)

        with col2:
            st.header('Meses de mayor actividad')
            months = sts.monthly_activity(selected_user, df)

            fig, ax = plt.subplots()
            ax.bar(months.index, months.values, color='orange')
            plt.xticks(rotation='vertical')
            plt.tight_layout()
            st.pyplot(fig)
preprocess.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import regex as re
5
+ import seaborn as sn
6
+
7
+
8
# helper: turn a raw message stamp into "date time"
def get_time_date(string):
    """Convert a 'date, time - ' stamp into 'date time'.

    Example: '12/05/21, 14:30 - '  ->  '12/05/21 14:30'
    """
    parts = string.split(',')
    # keep only the clock portion before the trailing ' - ' separator
    clock = parts[1].split('-')[0].strip()
    return parts[0] + " " + clock
16
+
17
# helper: keep only the first line of a message (drops the trailing '\n')
def get_string(text):
    """Return *text* truncated at the first newline."""
    return text.partition('\n')[0]
20
+
21
# final preprocessing function
def preprocess(data):
    """Parse a raw WhatsApp chat export into a tidy DataFrame.

    Parameters
    ----------
    data : str
        Full text of the exported chat, one message per
        'd/m/yy, hh:mm - User: text' line.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Message', 'Date', 'User', plus the datetime breakdown
        'Only date', 'Year', 'Month_num', 'Month', 'Day', 'Day_name',
        'Hour', 'Minute'. Group/system lines get User 'Group Notification'.
    """
    # date, time and dash prefix at the start of every message line
    # (raw string: avoids invalid-escape warnings for \d, \s)
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # separate dates from messages
    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)

    # put both in a dataframe
    df = pd.DataFrame({'user_messages': messages,
                       'message_date': dates})

    def _clean_stamp(stamp):
        # '12/05/21, 14:30 - '  ->  '12/05/21 14:30'
        parts = stamp.split(',')
        return parts[0] + " " + parts[1].split('-')[0].strip()

    df['message_date'] = df['message_date'].apply(_clean_stamp)
    # BUG FIX: the original renamed to lowercase 'date' but every later
    # access uses 'Date' (KeyError on df[['Message', 'Date', 'User']])
    df.rename(columns={'message_date': 'Date'}, inplace=True)

    # separate the username from the message body
    users = []
    texts = []

    for message in df['user_messages']:
        entry = re.split(r'([\w\W]+?):\s', message)  # extract the username
        if entry[1:]:
            users.append(entry[1])
            texts.append(entry[2])
        else:
            # group notifications have no 'User: ' prefix
            users.append('Group Notification')
            texts.append(entry[0])

    df['User'] = users
    # keep only the first line of each message (drops the trailing '\n')
    df['Message'] = [text.split('\n')[0] for text in texts]
    df = df.drop(['user_messages'], axis=1)

    df = df[['Message', 'Date', 'User']]

    # split the timestamp into its components; parse it ONCE instead of
    # once per derived column.
    # NOTE(review): default pandas parsing is month-first for ambiguous
    # dates — confirm against the export's locale (dayfirst may be needed)
    stamps = pd.to_datetime(df['Date'])
    df['Only date'] = stamps.dt.date
    df['Year'] = stamps.dt.year
    df['Month_num'] = stamps.dt.month
    df['Month'] = stamps.dt.month_name()
    df['Day'] = stamps.dt.day
    df['Day_name'] = stamps.dt.day_name()
    df['Hour'] = stamps.dt.hour
    df['Minute'] = stamps.dt.minute

    return df
stats_graphs.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from collections import Counter
3
+ from wordcloud import WordCloud
4
+ from urlextract import URLExtract
5
+ from nltk.corpus import stopwords
6
+ import nltk
7
+ nltk.download('stopwords')
8
+ import emoji
9
+
10
+
11
+
12
+
13
def fetch_stats(selected_user, df):
    """Headline stats for one user, or for the whole chat ('General').

    Returns a tuple: (message count, word count, media-file count,
    link count).
    """
    # narrow to a single user unless the whole-chat view was requested
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    # total messages
    num_messages = df.shape[0]

    # total words across all messages
    num_words = sum(len(message.split()) for message in df['Message'])

    # shared media files appear as the '<Media omitted>' placeholder
    media_rows = df[df['Message'] == '<Media omitted>']

    # shared links, found with urlextract
    extractor = URLExtract()
    num_links = sum(len(extractor.find_urls(message))
                    for message in df['Message'])

    return num_messages, num_words, media_rows.shape[0], num_links
38
+
39
+
40
# activity by user
def fetch_activity_users(df):
    """Top-5 most active users and a per-user percentage-of-total table."""
    # group notifications are not real user activity
    chat_only = df[df['User'] != 'Group Notification']

    # bar-chart data: the five busiest users
    top_five = chat_only['User'].value_counts().head()

    # table data: each user's share of the total message count, in percent
    share_df = pd.DataFrame(
        chat_only['User'].value_counts() / chat_only.shape[0] * 100)

    return top_five, share_df
49
+
50
# Word Cloud
def create_wordcloud(selected_user, df):
    """Word-cloud image built from the selected user's messages.

    Returns the generated WordCloud object, ready for ax.imshow().
    """
    # BUG FIX: the app passes 'General' for the whole chat, but this
    # compared against 'Overall', so the whole-chat view filtered down to
    # an empty frame and wc.generate('') raised. Accept both spellings.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]
    # generate the cloud
    wc = WordCloud(width=500, height=500,
                   min_font_size=12, background_color='black')
    # concatenate every message into one text blob for the generator
    df_wc = wc.generate(df['Message'].str.cat(sep=" "))

    return df_wc
62
+
63
+
64
# get the 20 most common words
def get_common_words(selected_user, df):
    """DataFrame of the 20 most frequent non-stopword words.

    Column 0 holds the word, column 1 its count.
    """
    # BUG FIX: assigning to the name 'stopwords' made it a local variable,
    # so 'stopwords.words(...)' raised UnboundLocalError before the nltk
    # corpus could be read. Use a distinct local name.
    spanish_stopwords = set(stopwords.words('spanish'))

    # BUG FIX: the app passes 'General', not 'Overall' — accept both
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    # BUG FIX: the original OR of two '!=' tests is always True, and it
    # compared the media placeholder against 'User'. Drop group
    # notifications AND media-placeholder messages.
    timeline = df[(df['User'] != 'Group Notification') &
                  (df['Message'] != '<Media omitted>')]

    # lowercase tokenisation, stopwords removed
    words = [word
             for message in timeline['Message']
             for word in message.lower().split()
             if word not in spanish_stopwords]

    top_20_w = pd.DataFrame(Counter(words).most_common(20))
    return top_20_w
88
+
89
# get the most used emojis
def get_emoji_stats(selected_user, df):
    """DataFrame of (emoji, count) pairs, most frequent first."""
    # BUG FIX: the app passes 'General', not 'Overall' — accept both
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    # BUG FIX: emoji.UNICODE_EMOJI was removed in emoji >= 2.0; prefer the
    # EMOJI_DATA mapping and fall back to the old attribute on older versions
    if hasattr(emoji, 'EMOJI_DATA'):
        emoji_lookup = emoji.EMOJI_DATA
    else:
        emoji_lookup = emoji.UNICODE_EMOJI['en']

    emojis = [char
              for message in df['Message']
              for char in message
              if char in emoji_lookup]

    counts = Counter(emojis)
    emoji_df = pd.DataFrame(counts.most_common(len(counts)))

    return emoji_df
103
+
104
# user activity per month
def monthly_timeline(selected_user, df):
    """Message count per (year, month) with a plot-ready 'Time' label.

    Returns a DataFrame with columns 'Year', 'Month_num', 'Month',
    'Message' (the count) and 'Time' ('MonthName-Year').
    """
    # BUG FIX: the app passes 'General', not 'Overall' — the old check
    # filtered the whole-chat view down to an empty frame. Accept both.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    timeline = df.groupby(['Year', 'Month_num', 'Month']).count()[
        'Message'].reset_index()

    # x-axis label, vectorized instead of an index loop
    timeline['Time'] = timeline['Month'] + "-" + timeline['Year'].astype(str)

    return timeline
121
+
122
# activity per month
def monthly_activity(selected_user, df):
    """Messages per month name for one user or the whole chat ('General')."""
    # BUG FIX: the app passes 'General', not 'Overall' — the old check
    # filtered the whole-chat view down to an empty frame. Accept both.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    return df['Month'].value_counts()
129
+
130
# activity per week
def weekly_activity(selected_user, df):
    """Messages per weekday name for one user or the whole chat ('General')."""
    # BUG FIX: the app passes 'General', not 'Overall' — the old check
    # filtered the whole-chat view down to an empty frame. Accept both.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    return df['Day_name'].value_counts()