imdebamrita committed on
Commit
72a61e9
1 Parent(s): 9a2fa71

Initial commit

Browse files
Files changed (1) hide show
  1. helper.py +175 -1
helper.py CHANGED
@@ -4,4 +4,178 @@ from collections import Counter
4
  import pandas as pd
5
  import emoji
6
 
7
- extractor = URLExtract()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import pandas as pd
5
  import emoji
6
 
7
# Module-level URL extractor, created once at import time; fetch_states calls
# its find_urls() on every message to count links.
+ extractor = URLExtract()
8
+
9
+
10
def fetch_states(selected_user, df):
    """Summarise headline chat statistics for one user or the whole chat.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        A 4-tuple ``(num_messages, num_words, num_media_messages, num_links)``.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    messages = scoped['message']

    # Each row is one message.
    total_messages = scoped.shape[0]

    # Words are the whitespace-separated tokens of every message.
    word_total = sum(len(text.split()) for text in messages)

    # Rows whose text is exactly the media placeholder.
    media_total = scoped[messages == '<Media omitted>\n'].shape[0]

    # Links are whatever the shared module-level extractor finds per message.
    link_total = sum(len(extractor.find_urls(text)) for text in messages)

    return total_messages, word_total, media_total, link_total
32
+
33
+
34
def monthly_timeline(selected_user, df):
    """Count messages per (year, month) for one user or the whole chat.

    The caller's DataFrame is left untouched: the helper ``month_num`` column
    is added to a copy via ``assign`` instead of being written into ``df``.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user``, ``date`` (datetime-like), ``year``,
            ``month`` and ``message`` columns.

    Returns:
        DataFrame with columns ``year``, ``month_num``, ``month``, ``Message``
        (the per-month count) and ``Timeline`` (``"<month>-<year>"``), sorted
        by the groupby keys.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # assign() returns a new frame, so neither the caller's df nor a filtered
    # slice of it is mutated.
    df = df.assign(month_num=df['date'].dt.month)

    # month_num is part of the key so months sort chronologically, not by name.
    timeline = (
        df.groupby(['year', 'month_num', 'month'])['message']
        .count()
        .reset_index()
    )

    timeline['time'] = [
        month + '-' + str(year)
        for month, year in zip(timeline['month'], timeline['year'])
    ]

    return timeline.rename(columns={'message': 'Message', 'time': 'Timeline'})
48
+
49
+
50
def daily_timeline(selected_user, df):
    """Count messages per calendar day for one user or the whole chat.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user``, ``only_date`` and ``message`` columns.

    Returns:
        DataFrame with columns ``Date`` and ``Message`` (the per-day count).
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    per_day = scoped.groupby('only_date')['message'].count().reset_index()
    return per_day.rename(columns={'only_date': 'Date', 'message': 'Message'})
59
+
60
+
61
def week_activity_map(selected_user, df):
    """Count messages per weekday name for one user or the whole chat.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user`` and ``day_name`` columns.

    Returns:
        DataFrame with columns ``Day`` and ``Count``, most frequent day first.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Name the index and the value column explicitly rather than renaming the
    # defaults produced by value_counts(): those defaults changed in
    # pandas 2.0, so a rename keyed on them silently yields wrong headers on
    # older versions.
    return (
        df['day_name']
        .value_counts()
        .rename_axis('Day')
        .reset_index(name='Count')
    )
70
+
71
+
72
def month_activity_map(selected_user, df):
    """Count messages per month name for one user or the whole chat.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user`` and ``month`` columns.

    Returns:
        DataFrame with columns ``Month`` and ``Count``, most frequent month
        first.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Name the index and the value column explicitly rather than renaming the
    # defaults produced by value_counts(): those defaults changed in
    # pandas 2.0, so a rename keyed on them silently yields wrong headers on
    # older versions.
    return (
        df['month']
        .value_counts()
        .rename_axis('Month')
        .reset_index(name='Count')
    )
81
+
82
+
83
def activity_heatmap(selected_user, df):
    """Build a weekday-by-period table of message counts.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user``, ``day_name``, ``period`` and ``message``
            columns.

    Returns:
        Pivot table indexed by ``day_name`` (axis named ``Day``) with one
        column per ``period`` (axis named ``Time Period``); combinations with
        no messages hold 0.
    """
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['user'] == selected_user]

    counts = scoped.pivot_table(
        index='day_name',
        columns='period',
        values='message',
        aggfunc='count',
    )
    # Missing (day, period) pairs come back as NaN; show them as zero.
    filled = counts.fillna(0)

    return filled.rename_axis(index='Day', columns='Time Period')
94
+
95
+
96
def most_active_user(df):
    """Rank users by message count and by share of all messages.

    Rows whose ``user`` is ``'group_notification'`` are excluded from both
    results.

    Args:
        df: DataFrame with a ``user`` column.

    Returns:
        A pair ``(x, per)``:
        ``x`` has columns ``User`` and ``Count`` for the top five users;
        ``per`` has columns ``User`` and ``Percent(%)`` for every user, the
        percentage rounded to two decimals.
    """
    temp = df[df['user'] != 'group_notification']
    counts = temp['user'].value_counts()

    # Column names are set explicitly instead of renaming the defaults from
    # value_counts(): those defaults changed in pandas 2.0, so a rename keyed
    # on them is version-dependent.
    x = counts.head().rename_axis('User').reset_index(name='Count')

    per = (
        round((counts / temp.shape[0]) * 100, 2)
        .rename_axis('User')
        .reset_index(name='Percent(%)')
    )
    return x, per
106
+
107
+
108
def create_wordcloud(selected_user, df):
    """Generate a word cloud from a user's (or the whole chat's) messages.

    Group notifications and media placeholders are dropped, and stop words
    listed in ``stop_ben-hin-eng.txt`` are removed before generation.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        The object returned by ``WordCloud.generate`` for the cleaned text.
    """
    # Close the file deterministically and keep the stop words as a set of
    # whole tokens. Testing membership against the raw file text would be a
    # substring match and would wrongly drop any word that merely occurs
    # inside a stop word (e.g. "he" inside "the").
    with open('stop_ben-hin-eng.txt', 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    def remove_stop_words(message):
        """Lower-case the message and drop every stop-word token."""
        return " ".join(
            word for word in message.lower().split() if word not in stop_words
        )

    # Work on a separate Series rather than assigning into the filtered
    # slice, which would trigger pandas' SettingWithCopy behaviour.
    cleaned = temp['message'].apply(remove_stop_words)

    wc = WordCloud(width=500, height=500, min_font_size=10,
                   background_color='white')
    return wc.generate(cleaned.str.cat(sep=" "))
131
+
132
+
133
def most_common_words(selected_user, df):
    """Return the 20 most frequent non-stop words, least frequent first.

    Group notifications and media placeholders are ignored; words are
    lower-cased and split on whitespace; stop words come from
    ``stop_ben-hin-eng.txt``.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        DataFrame with columns ``Message`` (the word) and ``Count``, in
        ascending order of frequency. The columns are present even when no
        words remain.
    """
    # Close the file deterministically and keep the stop words as a set of
    # whole tokens. Testing membership against the raw file text would be a
    # substring match and would wrongly drop any word that merely occurs
    # inside a stop word (e.g. "he" inside "the").
    with open('stop_ben-hin-eng.txt', 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = [
        word
        for message in temp['message']
        for word in message.lower().split()
        if word not in stop_words
    ]

    # Naming the columns at construction keeps the schema stable for an
    # empty result; the reversal puts the most frequent word last.
    most_common_df = pd.DataFrame(
        Counter(words).most_common(20), columns=['Message', 'Count']
    ).iloc[::-1]

    return most_common_df
153
+
154
+
155
def emoji_data(selected_user, df):
    """Count every emoji character used by one user or the whole chat.

    A character counts as an emoji when it is a key of ``emoji.EMOJI_DATA``.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        DataFrame with columns ``Emoji`` and ``Count``, most frequent first.
        The columns are present even when no emoji was found, so callers can
        rely on a fixed schema.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = [
        char
        for message in df['message']
        for char in message
        if char in emoji.EMOJI_DATA
    ]

    # most_common() with no argument already returns every entry; naming the
    # columns here keeps them on an empty frame as well.
    return pd.DataFrame(Counter(emojis).most_common(), columns=['Emoji', 'Count'])
170
+
171
+
172
def data_timeframe(df):
    """Describe the span covered by the data as ``"<first> to <last>"``.

    Args:
        df: DataFrame with ``day``, ``month`` and ``year`` columns; the first
            and last rows are read positionally.

    Returns:
        A string such as ``"1 January 2020 to 15 March 2021"``, built from the
        ``day``, ``month`` and ``year`` values of the first and last rows.
    """
    first_row = df.iloc[0]
    last_row = df.iloc[-1]

    def as_text(row):
        # "day month year", each value converted with str().
        return " ".join(str(row[field]) for field in ('day', 'month', 'year'))

    return as_text(first_row) + " to " + as_text(last_row)