smith2020 committed on
Commit
5221e66
·
1 Parent(s): 630e844

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +241 -1
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from io import StringIO
4
- import precessing_data,helper
5
  import matplotlib.pyplot as plt
6
  from collections import Counter
7
  import seaborn as sns
@@ -12,6 +12,246 @@ summarizer = pipeline("summarization")
12
 
13
  st.sidebar.title("Whatapp Chat Analysis")
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  uploaded_file = st.sidebar.file_uploader("Choose a file")
 
1
  import streamlit as st
2
  import pandas as pd
3
  from io import StringIO
4
+ #import precessing_data,helper
5
  import matplotlib.pyplot as plt
6
  from collections import Counter
7
  import seaborn as sns
 
12
 
13
  st.sidebar.title("Whatapp Chat Analysis")
14
 
15
+ import re
16
+ import pandas as pd
17
+
18
+
19
def preprocess(data):
    """Parse a raw WhatsApp chat export into a tidy DataFrame.

    Parameters
    ----------
    data : str
        Full text of an exported chat; each message starts with a
        timestamp prefix like ``12/01/2023, 10:30 - ``.

    Returns
    -------
    pandas.DataFrame
        One row per message with columns: date, user, message, only_date,
        year, month_num, month, day, day_name, hour, minute, period.
    """
    # Raw string for the regex: '\d'/'\s' in a plain string are invalid
    # escapes and raise warnings on modern Python.
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)

    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
    # Convert the timestamp prefix to a proper datetime.
    # NOTE(review): the regex accepts 2-digit years but '%Y' requires 4
    # digits -- confirm the export format uses 4-digit years.
    df['date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')

    # Split "Name: text" into sender and body; messages without a sender
    # are system notifications (e.g. "Messages are end-to-end encrypted").
    users = []
    messages = []
    for message in df['user_message']:
        entry = re.split(r'([\w\W]+?):\s', message)
        if entry[1:]:  # a sender name was captured
            users.append(entry[1])
            messages.append(" ".join(entry[2:]))
        else:
            users.append('group_notification')
            messages.append(entry[0])

    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    # Derived calendar fields used by the various analysis views.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour-bucket labels for the activity heatmap (e.g. "10-11", "23-00").
    # Was df[['day_name', 'hour']]['hour'] -- the extra column selection
    # was redundant.
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + str('00'))
        elif hour == 0:
            period.append(str('00') + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))

    df['period'] = period

    return df
66
+
67
+
68
+
69
+ from urlextract import URLExtract
70
+ import pandas as pd
71
+ from collections import Counter
72
+ ex=URLExtract()
73
+ from wordcloud import WordCloud, STOPWORDS
74
+ import emoji
75
+
76
+
77
+
78
+
79
def fetch_stats(selected_user, df):
    """Summarize chat activity for one user (or "Over All").

    Returns a 4-tuple: (message count, word count, media-message count,
    link count). Links are detected with the module-level URLExtract
    instance ``ex``.
    """
    if selected_user != "Over All":
        df = df[df["user"] == selected_user]

    # Total number of messages after filtering.
    total_messages = df.shape[0]

    # Every whitespace-separated token across all messages.
    words = [token for text in df["message"] for token in text.split()]

    # Media placeholders left behind by the WhatsApp export.
    media_count = df[df["message"] == "<Media omitted>\n"].shape[0]

    # All URLs found anywhere in the messages.
    links = [url for text in df["message"] for url in ex.find_urls(text)]

    return total_messages, len(words), media_count, len(links)
97
+
98
+ #Most Busy Users
99
#Most Busy Users
def m_b_u(df):
    """Most-busy-users summary.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        ``x``  -- top-5 senders by message count.
        ``dl`` -- each sender's share of all messages in percent, with
                  columns ``name`` and ``presentage``.
    """
    x = df["user"].value_counts().head()
    # Most Busy Users percentage, rounded to 2 decimals.
    dl = round((df["user"].value_counts() / df.shape[0]) * 100, 2).reset_index()
    # Assign names positionally: value_counts().reset_index() yields
    # ['index', 'user'] on pandas < 2.0 but ['user', 'count'] on >= 2.0,
    # so the old rename(columns={'index': 'name', ...}) silently did
    # nothing on modern pandas.
    dl.columns = ["name", "presentage"]
    return x, dl
105
+
106
+
107
+ #creating wordcloud
108
+
109
def create_wordcloud(selected_user, df):
    """Build a word-cloud image from the selected user's messages.

    Stop words are read from ``stop_hinglish.txt``. Returns the generated
    ``WordCloud`` object.
    """
    if selected_user != "Over All":
        df = df[df["user"] == selected_user]

    # Context manager so the stop-word file is always closed, and a set of
    # whole words instead of the raw file text: substring membership
    # ('word in file_text') wrongly dropped words that merely appear
    # inside longer stop words.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = set(f.read().split())

    # Ignore system notifications and media placeholders. .copy() avoids
    # pandas' SettingWithCopy warning on the assignment below.
    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n'].copy()

    def remove_stop_words(message):
        # Keep only non-stop words, lower-cased.
        return " ".join(w for w in message.lower().split() if w not in stop_words)

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    temp['message'] = temp['message'].apply(remove_stop_words)
    df_wc = wc.generate(temp['message'].str.cat(sep=" "))
    return df_wc
129
+
130
+
131
def most_common_words(selected_user, df):
    """Return a DataFrame of the 20 most frequent non-stop words.

    Column 0 holds the word, column 1 its count, most frequent first.
    """
    if selected_user != "Over All":
        df = df[df["user"] == selected_user]

    # Context manager so the stop-word file is always closed, and a set of
    # whole words instead of the raw file text: substring membership
    # ('word in file_text') wrongly dropped words that merely appear
    # inside longer stop words.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = set(f.read().split())

    # Ignore system notifications and media placeholders.
    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df
150
+
151
+
152
+
153
+
154
def emoji_helper(selected_user, df):
    """Count every emoji used by the selected user.

    Returns a DataFrame of (emoji, count) pairs, most frequent first.
    """
    if selected_user != "Over All":
        df = df[df["user"] == selected_user]

    # Collect every character that the emoji package recognizes.
    emojis = []
    for message in df['message']:
        emojis.extend(c for c in message if c in emoji.EMOJI_DATA)

    # Build the Counter once; most_common() with no argument already
    # returns all entries (the original constructed a second Counter
    # just to compute its length).
    emoji_df = pd.DataFrame(Counter(emojis).most_common())

    return emoji_df
165
+
166
+
167
+
168
+
169
def time_line(selected_user, df):
    """Monthly message-count timeline for the selected user.

    Adds a combined ``time_year`` label (e.g. "January- 2023") for use as
    a plot axis.
    """
    if selected_user != "Over All":
        df = df[df["user"] == selected_user]

    # Messages per (year, month) pair.
    monthly = df.groupby(["year", "month"]).count()["message"].reset_index()

    # "Month- Year" label for each row.
    monthly["time_year"] = [
        f'{m}- {y}' for m, y in zip(monthly["month"], monthly["year"])
    ]

    return monthly
181
+
182
def daily_timeline(selected_user, df):
    """Message count per calendar day for the selected user."""
    subset = df if selected_user == "Over All" else df[df["user"] == selected_user]
    # One row per distinct 'only_date', with the number of messages.
    return subset.groupby('only_date').count()['message'].reset_index()
189
+
190
+
191
+
192
def week_activity_map(selected_user, df):
    """Messages per weekday (Series indexed by day name, busiest first)."""
    subset = df if selected_user == "Over All" else df[df["user"] == selected_user]
    return subset['day_name'].value_counts()
197
+
198
+
199
def month_activity_map(selected_user, df):
    """Messages per month (Series indexed by month name, busiest first)."""
    subset = df if selected_user == "Over All" else df[df["user"] == selected_user]
    return subset['month'].value_counts()
204
+
205
+
206
+
207
+
208
+
209
def activity_heatmap(selected_user, df):
    """Day-of-week x hour-period message counts for the heatmap plot."""
    subset = df if selected_user == "Over All" else df[df["user"] == selected_user]
    # Count messages per (day_name, period) cell; combinations with no
    # messages become 0 instead of NaN.
    return subset.pivot_table(
        index='day_name', columns='period', values='message', aggfunc='count'
    ).fillna(0)
216
+
217
+
218
+ # date to the message
219
+
220
+ from urlextract import URLExtract
221
+
222
+
223
def d_message(selected_user, df):
    """Collect the selected user's recent plain-text messages for summarization.

    Messages containing URLs, media placeholders, and deleted-message
    notices are skipped; the trailing newline is stripped from each kept
    message. For "Over All" the messages are joined into one string,
    otherwise a list of message strings is returned.
    """
    if selected_user != "Over All":
        df = df[df["user"] == selected_user]
        df = df.groupby('user')
        df = df.get_group(selected_user)

    c = URLExtract()  # used only to detect (and skip) messages with links

    # NOTE(review): the date window is hard-coded; a yesterday-based
    # filter (commented out below, as in the original) looks like the
    # intended behavior -- confirm and switch over. The original also
    # computed yesterday's date into 'now' without ever using it; that
    # dead code is removed here.
    # yesterday = str(datetime.datetime.today() - datetime.timedelta(days=1))[:10]
    # filtered_df = df.loc[df['date'] == yesterday]
    filtered_df = df.loc[(df['date'] >= '2023-01-27')
                         & (df['date'] < '2023-01-30')]

    d = []
    for msg in filtered_df["message"]:
        if c.find_urls(msg) or msg == '<Media omitted>\n' or msg == 'This message was deleted\n':
            continue
        # The original also ran " ".join(msg) here and discarded the
        # result -- a no-op, removed.
        d.append(msg[0:-1])  # drop the trailing newline
    if selected_user == "Over All":
        d = " ".join(d)
    return d
249
+
250
+
251
+
252
+
253
+
254
+
255
 
256
 
257
  uploaded_file = st.sidebar.file_uploader("Choose a file")