whatsapp / helper.py
Saaquib's picture
Update helper.py
ab8371c
from urlextract import URLExtract
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
import emoji
extract = URLExtract()
def fetch_stats(selected_users, df):
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
# fetch the number of messages
num_messages = df.shape[0]
# fetch the number of words
words = []
for i in df['message']:
words.extend(i.split())
# fetch the number of media messages
num_media_messages = df[df['message'] == 'voice omitted'].shape[0]
# fetch the number of link shared
links = []
for i in df['message']:
links.extend(extract.find_urls(i))
return num_messages, len(words), num_media_messages, len(links)
def first_last_msg (df):
first_last_msg = df.drop(columns=['date', 'month_num'], axis=1)[1:]
first_last_msg = first_last_msg[~first_last_msg['user'].str.startswith('Group')]
first_last_msg = first_last_msg[~first_last_msg['message'].str.endswith('group')]
first_last_msg = first_last_msg[~first_last_msg['message'].str.endswith('contact')]
first_last_msg = first_last_msg[~first_last_msg['message'].str.endswith('you')]
first_last_msg ['date'] = first_last_msg [['year', 'month', 'day']].astype(str).agg('-'.join, axis=1)
first_last_msg ['time'] = first_last_msg [['hour', 'minute']].astype(str).agg(':'.join, axis=1)
first_last_msg = first_last_msg [['user', 'message', 'date', 'time']]
return first_last_msg
def connected_days(df):
first_date = pd.to_datetime(df['date'].min().date())
last_date = pd.to_datetime(df['date'].max().date())
connected_days = (last_date - first_date).days
return connected_days
def avg_messages_per_day(selected_users,df):
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
first_date = pd.to_datetime(df['date'].min().date())
last_date = pd.to_datetime(df['date'].max().date())
num_days = (last_date - first_date).days
num_messages = df.shape[0]
avg_messages_per_day = num_messages / num_days
return round(avg_messages_per_day)
def max_active_day(df):
df['only_date'] = df['date'].dt.date
daily_timeline = df.groupby(['only_date']).count()['message'].reset_index().rename(columns={'only_date':'Date','message':'Message Counts'})
max_active_day = daily_timeline.nlargest(1, 'Message Counts')
return max_active_day
def max_active_month(selected_users, df):
timeline = monthly_timeline_msg(selected_users, df)
max_month = timeline.loc[timeline['Message Count'].idxmax()]
year = max_month['Year']
month = max_month['Month']
message_count = max_month['Message Count']
max_active_month = pd.DataFrame({'Year': [year], 'Month': [month], 'Message Count': [message_count]})
return max_active_month
def most_busy_users(df):
df['user'] = df['user'].astype(str) # Convert 'user' column to string type
df['user'] = df['user'].str.strip() # Remove leading/trailing whitespaces if necessary
x = df['user'].value_counts().head()
df = round((df['user'].value_counts() / df.shape[0]) * 100, 2).reset_index().rename(
columns={'user': 'Name', 'index': 'User'}
)
df = df[~df['User'].str.startswith('Group')]
return x, df
def create_wordcloud(selected_users, df):
f = open('stop_hinglish.txt', 'r')
stop_words = f.read()
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
temp = df[df['message'] != 'group_notification']
temp = temp[temp['message'] != '<Media_omitted>\n']
def remove_stopwords(message):
y = []
for i in message.lower().split():
if i not in stop_words:
y.append(i)
return " ".join(y)
wc = WordCloud(width=500, height=500, min_font_size=10,
background_color='White')
temp['message'] = temp['message'].apply(remove_stopwords)
df_wc = wc.generate(temp['message'].str.cat(sep=" "))
return df_wc
def most_common_words(selected_users, df):
f = open('stop_hinglish.txt', 'r')
stop_words = f.read()
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
temp = df[df['message'] != 'group_notification']
temp = temp[temp['message'] != '<Media_omitted>\n']
words = []
for i in temp['message']:
for word in i.lower().split():
# Exclude words that contain emojis
if word not in stop_words and not any(c for c in word if c in emoji.UNICODE_EMOJI['en']):
words.append(word)
most_common_words = pd.DataFrame(Counter(words).most_common(10)).rename(columns={0:'Words',1:'Counts'}).add_suffix(' ')
return most_common_words
def emoji_helper(selected_users, df):
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
emojis = []
for message in df['message']:
emojis.extend([c for c in message if c in emoji.UNICODE_EMOJI['en']])
emoji_counts = Counter(emojis)
top_5_emojis = emoji_counts.most_common(5)
emoji_df = pd.DataFrame(top_5_emojis, columns=['Emoji ', 'Count '])
return emoji_df
def monthly_timeline_msg(selected_users, df):
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
timeline = df.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()
time = []
for i in range(timeline.shape[0]):
time.append(timeline['month'][i]+' - '+str(timeline['year'][i]))
timeline = timeline.drop(columns=['month_num'],axis=1)
timeline = timeline.rename(columns={'year': 'Year', 'month': 'Month', 'message': 'Message Count'})
return timeline
def monthly_timeline(selected_users, df):
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
timeline = df.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()
time = []
for i in range(timeline.shape[0]):
time.append(timeline['month'][i]+' - '+str(timeline['year'][i]))
timeline['time'] = time
timeline = timeline.drop(columns=['month_num'],axis=1)
return timeline
def daily_message(selected_users, df):
if selected_users != 'Overall':
df = df[df['user'] == selected_users]
df['only_date'] = df['date'].dt.date
daily_message = df.groupby(['only_date']).count()['message'].reset_index().rename(columns={'only_date':'Date','message':'Message Counts'})
return daily_message