"""Helper functions that compute statistics over a DataFrame of messages.

The functions read (a subset of) the columns ``user``, ``message``, ``date``,
``year``, ``month``, ``month_num``, ``day``, ``hour`` and ``minute``.  Most of
them accept ``selected_users``: either the literal ``'Overall'`` (use every
row) or a value that is matched against the ``user`` column.
"""

from collections import Counter

import emoji
import pandas as pd
from urlextract import URLExtract
from wordcloud import WordCloud

extract = URLExtract()

# Whitespace-separated list of words ignored by the word statistics.
_STOP_WORDS_PATH = 'stop_hinglish.txt'


def _filter_user(selected_users, df):
    """Return the rows of *df* for one user, or *df* itself for ``'Overall'``."""
    if selected_users != 'Overall':
        return df[df['user'] == selected_users]
    return df


def _load_stop_words():
    """Read the stop-word file and return its words as a set.

    A set gives whole-word membership tests; testing against the raw file
    text would be a substring match and discard unrelated short tokens.
    """
    with open(_STOP_WORDS_PATH, 'r', encoding='utf-8') as stop_file:
        return set(stop_file.read().split())


def _is_emoji(char):
    """Return True when the single character *char* is an emoji.

    ``emoji.UNICODE_EMOJI`` is not available in emoji >= 2.0, so prefer
    ``emoji.is_emoji`` and only fall back to the legacy table on old versions.
    """
    if hasattr(emoji, 'is_emoji'):
        return emoji.is_emoji(char)
    return char in emoji.UNICODE_EMOJI['en']


def _content_messages(selected_users, df):
    """Return the ``message`` Series without notification and blank rows."""
    messages = _filter_user(selected_users, df)['message']
    return messages[(messages != 'group_notification') & (messages != '\n')]


def _day_span(df):
    """Return the number of whole days between the earliest and latest ``date``."""
    first_date = pd.to_datetime(df['date'].min().date())
    last_date = pd.to_datetime(df['date'].max().date())
    return (last_date - first_date).days


def _daily_counts(df):
    """Add an ``only_date`` column to *df* and count messages per calendar day.

    Returns a frame with the columns ``Date`` and ``Message Counts``.
    """
    df['only_date'] = df['date'].dt.date
    per_day = df.groupby(['only_date']).count()['message'].reset_index()
    return per_day.rename(columns={'only_date': 'Date', 'message': 'Message Counts'})


def _monthly_counts(selected_users, df):
    """Count messages per (year, month_num, month) for the selected user(s)."""
    df = _filter_user(selected_users, df)
    return df.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()


def fetch_stats(selected_users, df):
    """Return headline counts for the selected user(s).

    Returns:
        A tuple ``(num_messages, num_words, num_media_messages, num_links)``
        where words are whitespace-separated tokens, media messages are rows
        whose message equals ``'voice omitted'``, and links are the URLs found
        by the module-level ``extract``.
    """
    df = _filter_user(selected_users, df)

    num_messages = df.shape[0]

    num_words = 0
    num_links = 0
    for message in df['message']:
        num_words += len(message.split())
        num_links += len(extract.find_urls(message))

    num_media_messages = df[df['message'] == 'voice omitted'].shape[0]

    return num_messages, num_words, num_media_messages, num_links


def first_last_msg(df):
    """Return a cleaned ``user`` / ``message`` / ``date`` / ``time`` view of *df*.

    The first row is skipped, as are rows whose user starts with ``'Group'``
    or whose message ends with ``'group'``, ``'contact'`` or ``'you'``
    (presumably system notifications -- confirm against the preprocessor).
    ``date`` is rebuilt as ``year-month-day`` and ``time`` as ``hour:minute``,
    both as strings.
    """
    messages = df.drop(columns=['date', 'month_num'])[1:]

    keep = ~messages['user'].str.startswith('Group')
    for suffix in ('group', 'contact', 'you'):
        keep = keep & ~messages['message'].str.endswith(suffix)
    # Copy so the column assignments below never write into a slice of *df*.
    messages = messages[keep].copy()

    messages['date'] = messages[['year', 'month', 'day']].astype(str).agg('-'.join, axis=1)
    messages['time'] = messages[['hour', 'minute']].astype(str).agg(':'.join, axis=1)
    return messages[['user', 'message', 'date', 'time']]


def connected_days(df):
    """Return the number of days between the first and last message dates."""
    return _day_span(df)


def avg_messages_per_day(selected_users, df):
    """Return the rounded average number of messages per day.

    The divisor is the day span between the first and last message.  A span
    of zero (all messages on one calendar day) is treated as one day, and an
    empty selection yields ``0``.
    """
    df = _filter_user(selected_users, df)
    if df.empty:
        return 0

    num_days = max(_day_span(df), 1)  # avoid dividing by zero
    return round(df.shape[0] / num_days)


def max_active_day(df):
    """Return a one-row frame (``Date``, ``Message Counts``) for the busiest day.

    Side effect: adds an ``only_date`` column to *df*.
    """
    return _daily_counts(df).nlargest(1, 'Message Counts')


def max_active_month(selected_users, df):
    """Return a one-row frame (``Year``, ``Month``, ``Message Count``).

    The row describes the month with the most messages for the selected
    user(s); the frame is empty when there are no messages.
    """
    columns = ['Year', 'Month', 'Message Count']
    timeline = monthly_timeline_msg(selected_users, df)
    if timeline.empty:
        return pd.DataFrame(columns=columns)

    busiest = timeline.loc[timeline['Message Count'].idxmax()]
    return pd.DataFrame({name: [busiest[name]] for name in columns})


def most_busy_users(df):
    """Return the most active users and every user's share of messages.

    Side effect: the ``user`` column of *df* is converted to stripped strings.

    Returns:
        A tuple ``(x, percent_df)``.  ``x`` is the ``value_counts()`` head
        (top five users).  ``percent_df`` has the columns ``User`` (user name)
        and ``Name`` (percentage of all messages, two decimals), without users
        whose name starts with ``'Group'``.
    """
    df['user'] = df['user'].astype(str).str.strip()

    counts = df['user'].value_counts()
    x = counts.head()

    percent = (counts / df.shape[0] * 100).round(2)
    # Build the frame explicitly: the column names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    percent_df = pd.DataFrame({'User': percent.index, 'Name': percent.values})
    percent_df = percent_df[~percent_df['User'].str.startswith('Group')]
    return x, percent_df


def create_wordcloud(selected_users, df):
    """Generate a word cloud from the selected user(s)' messages.

    Messages are lower-cased and stop words are removed before the text is
    handed to ``WordCloud.generate``; the generated ``WordCloud`` is returned.
    """
    stop_words = _load_stop_words()
    messages = _content_messages(selected_users, df)

    cleaned = [
        " ".join(word for word in message.lower().split() if word not in stop_words)
        for message in messages
    ]

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='White')
    return wc.generate(" ".join(cleaned))


def most_common_words(selected_users, df):
    """Return the ten most frequent words as a frame (``'Words '``, ``'Counts '``).

    Words are lower-cased; stop words and words containing an emoji are
    ignored.  The column names keep their trailing space.
    """
    stop_words = _load_stop_words()

    words = []
    for message in _content_messages(selected_users, df):
        for word in message.lower().split():
            if word in stop_words:
                continue
            if any(_is_emoji(char) for char in word):
                continue
            words.append(word)

    return pd.DataFrame(Counter(words).most_common(10), columns=['Words ', 'Counts '])


def emoji_helper(selected_users, df):
    """Return the five most used emojis as a frame (``'Emoji '``, ``'Count '``)."""
    df = _filter_user(selected_users, df)

    emoji_counts = Counter(
        char for message in df['message'] for char in message if _is_emoji(char)
    )
    return pd.DataFrame(emoji_counts.most_common(5), columns=['Emoji ', 'Count '])


def monthly_timeline_msg(selected_users, df):
    """Return message counts per month as ``Year`` / ``Month`` / ``Message Count``."""
    timeline = _monthly_counts(selected_users, df).drop(columns=['month_num'])
    return timeline.rename(
        columns={'year': 'Year', 'month': 'Month', 'message': 'Message Count'}
    )


def monthly_timeline(selected_users, df):
    """Return message counts per month with a ``'<month> - <year>'`` label.

    The result has the columns ``year``, ``month``, ``message`` and ``time``.
    """
    timeline = _monthly_counts(selected_users, df)
    timeline['time'] = timeline['month'] + ' - ' + timeline['year'].astype(str)
    return timeline.drop(columns=['month_num'])


def daily_message(selected_users, df):
    """Return message counts per calendar day (``Date``, ``Message Counts``).

    Side effect: for ``'Overall'`` an ``only_date`` column is added to *df*.
    """
    if selected_users != 'Overall':
        # Work on a copy so adding 'only_date' does not write into a slice.
        df = df[df['user'] == selected_users].copy()
    return _daily_counts(df)