Irene G committed on
Commit
ef0c1fa
·
1 Parent(s): 5398988
Files changed (3) hide show
  1. app.py +158 -0
  2. preprocess.py +75 -0
  3. stats_graphs.py +136 -0
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ import regex as re
5
+ import matplotlib.pyplot as plt
6
+
7
+ import preprocess as prep
8
+ import stats_graphs as sts
9
+
10
+
11
st.sidebar.title('Analiza tu chat de WhatsApp')

# --- chat file upload --------------------------------------------------------
uploaded_file = st.sidebar.file_uploader('Por favor, sube aquí el archivo .txt del chat')

if uploaded_file is not None:
    # raw bytes of the uploaded export; WhatsApp .txt exports are UTF-8
    bytes_data = uploaded_file.getvalue()
    data = bytes_data.decode('utf-8')

    # parse the export into a one-row-per-message dataframe
    df = prep.preprocess(data)

    # st.dataframe(df)  # debugging aid: show the parsed chat

    # unique users
    user_list = df['User'].unique().tolist()

    # drop the synthetic 'Group Notification' author and sort the rest.
    # ROBUSTNESS: guarded — a chat with no group notifications would
    # otherwise raise ValueError on .remove()
    if 'Group Notification' in user_list:
        user_list.remove('Group Notification')
    user_list.sort()

    # 'General' at position 0 so the whole-group analysis is the default
    user_list.insert(0, 'General')

    selected_user = st.sidebar.selectbox(
        'Mostrar análisis para ', user_list)

    st.title('Análisis del chat de Whats App para ' + selected_user)
    if st.sidebar.button('Mostrar análisis'):

        # headline stats of the selected user
        num_messages, num_words, media_omitted, links = sts.fetch_stats(
            selected_user, df)

        # four columns for the stats (messages, words, media and links).
        # BUG FIX: st.beta_columns was removed in Streamlit 1.0 — st.columns
        # is the stable API
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.header('Nº de mensajes')
            st.title(num_messages)

        with col2:
            st.header('Nº de palabras')
            st.title(num_words)

        with col3:
            st.header('Nº de archivos compartidos')
            st.title(media_omitted)

        with col4:
            st.header('Nº de enlaces compartidos')
            st.title(links)

        # activity of the users (whole-group view only)
        if selected_user == 'General':

            # two columns: bar chart with the top-5 most active users, and a
            # table with each user's percentage of total activity
            st.title('Actividad de los usuarios')
            activity_count, act_df = sts.fetch_activity_users(df)

            fig, ax = plt.subplots()
            col1, col2 = st.columns(2)
            with col1:
                ax.bar(activity_count.index, activity_count.values, color='green')
                plt.xticks(rotation='vertical')
                st.pyplot(fig)

            with col2:
                st.dataframe(act_df)

        # word cloud for the selected user
        st.title('Nube de palabras')
        df_img = sts.create_wordcloud(selected_user, df)
        fig, ax = plt.subplots()
        ax.imshow(df_img)
        st.pyplot(fig)

        # most common words in the chat
        most_common_df = sts.get_common_words(selected_user, df)
        fig, ax = plt.subplots()
        ax.barh(most_common_df[0], most_common_df[1])
        plt.xticks(rotation='vertical')
        st.title('Palabras más utilizadas')
        st.pyplot(fig)

        # emoji analysis
        emoji_df = sts.get_emoji_stats(selected_user, df)

        st.title('Análisis de emojis')

        # ROBUSTNESS: a chat with no emojis yields an empty frame; renaming
        # its (zero) columns or dividing by a zero total would crash
        if not emoji_df.empty:
            emoji_df.columns = ['Emoji', 'Total']

            col1, col2 = st.columns(2)

            # counts
            with col1:
                st.dataframe(emoji_df)
            # percentage of the total emoji count
            with col2:
                emoji_count = list(emoji_df['Total'])
                perlist = [(i / sum(emoji_count)) * 100 for i in emoji_count]
                emoji_df['Porcentaje'] = np.array(perlist)
                st.dataframe(emoji_df)

        # monthly timeline
        st.title('Actividad por mes')
        timeline = sts.monthly_timeline(selected_user, df)
        fig, ax = plt.subplots()
        ax.plot(timeline['Time'], timeline['Message'], color='blue')
        plt.xticks(rotation='vertical')
        plt.tight_layout()
        st.pyplot(fig)

        # activity maps: busiest days and months
        st.title('Mapas de actividad')

        col1, col2 = st.columns(2)

        with col1:
            st.header('Días de mayor actividad')

            days = sts.weekly_activity(selected_user, df)

            fig, ax = plt.subplots()
            ax.bar(days.index, days.values, color='purple')
            plt.xticks(rotation='vertical')
            plt.tight_layout()
            st.pyplot(fig)

        with col2:
            st.header('Meses de mayor actividad')
            months = sts.monthly_activity(selected_user, df)

            fig, ax = plt.subplots()
            ax.bar(months.index, months.values, color='orange')
            plt.xticks(rotation='vertical')
            plt.tight_layout()
            st.pyplot(fig)
preprocess.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import regex as re
5
+ import seaborn as sn
6
+
7
+
8
# helper: turn a raw message stamp into "date time"
def get_time_date(string):
    """Convert a 'date, time - ' stamp into 'date time'.

    Example: '12/05/21, 14:30 - '  ->  '12/05/21 14:30'
    """
    parts = string.split(',')
    # keep only the clock portion before the trailing ' - ' separator
    clock = parts[1].split('-')[0].strip()
    return parts[0] + " " + clock
16
+
17
# helper: keep only the first line of a message (drops the trailing '\n')
def get_string(text):
    """Return *text* truncated at the first newline."""
    return text.partition('\n')[0]
20
+
21
# final preprocessing function
def preprocess(data):
    """Parse a raw WhatsApp chat export into a tidy DataFrame.

    Parameters
    ----------
    data : str
        Full text of the exported chat, one message per
        'd/m/yy, hh:mm - User: text' line.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Message', 'Date', 'User', plus the datetime breakdown
        'Only date', 'Year', 'Month_num', 'Month', 'Day', 'Day_name',
        'Hour', 'Minute'. Group/system lines get User 'Group Notification'.
    """
    # date, time and dash prefix at the start of every message line
    # (raw string: avoids invalid-escape warnings for \d, \s)
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # separate dates from messages
    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)

    # put both in a dataframe
    df = pd.DataFrame({'user_messages': messages,
                       'message_date': dates})

    def _clean_stamp(stamp):
        # '12/05/21, 14:30 - '  ->  '12/05/21 14:30'
        parts = stamp.split(',')
        return parts[0] + " " + parts[1].split('-')[0].strip()

    df['message_date'] = df['message_date'].apply(_clean_stamp)
    # BUG FIX: the original renamed to lowercase 'date' but every later
    # access uses 'Date' (KeyError on df[['Message', 'Date', 'User']])
    df.rename(columns={'message_date': 'Date'}, inplace=True)

    # separate the username from the message body
    users = []
    texts = []

    for message in df['user_messages']:
        entry = re.split(r'([\w\W]+?):\s', message)  # extract the username
        if entry[1:]:
            users.append(entry[1])
            texts.append(entry[2])
        else:
            # group notifications have no 'User: ' prefix
            users.append('Group Notification')
            texts.append(entry[0])

    df['User'] = users
    # keep only the first line of each message (drops the trailing '\n')
    df['Message'] = [text.split('\n')[0] for text in texts]
    df = df.drop(['user_messages'], axis=1)

    df = df[['Message', 'Date', 'User']]

    # split the timestamp into its components; parse it ONCE instead of
    # once per derived column.
    # NOTE(review): default pandas parsing is month-first for ambiguous
    # dates — confirm against the export's locale (dayfirst may be needed)
    stamps = pd.to_datetime(df['Date'])
    df['Only date'] = stamps.dt.date
    df['Year'] = stamps.dt.year
    df['Month_num'] = stamps.dt.month
    df['Month'] = stamps.dt.month_name()
    df['Day'] = stamps.dt.day
    df['Day_name'] = stamps.dt.day_name()
    df['Hour'] = stamps.dt.hour
    df['Minute'] = stamps.dt.minute

    return df
stats_graphs.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from collections import Counter
3
+ from wordcloud import WordCloud
4
+ from urlextract import URLExtract
5
+ from nltk.corpus import stopwords
6
+ import nltk
7
+ nltk.download('stopwords')
8
+ import emoji
9
+
10
+
11
+
12
+
13
def fetch_stats(selected_user, df):
    """Headline stats for one user, or for the whole chat ('General').

    Returns a tuple: (message count, word count, media-file count,
    link count).
    """
    # narrow to a single user unless the whole-chat view was requested
    if selected_user != 'General':
        df = df[df['User'] == selected_user]

    # total messages
    num_messages = df.shape[0]

    # total words across all messages
    num_words = sum(len(message.split()) for message in df['Message'])

    # shared media files appear as the '<Media omitted>' placeholder
    media_rows = df[df['Message'] == '<Media omitted>']

    # shared links, found with urlextract
    extractor = URLExtract()
    num_links = sum(len(extractor.find_urls(message))
                    for message in df['Message'])

    return num_messages, num_words, media_rows.shape[0], num_links
38
+
39
+
40
# activity by user
def fetch_activity_users(df):
    """Top-5 most active users and a per-user percentage-of-total table."""
    # group notifications are not real user activity
    chat_only = df[df['User'] != 'Group Notification']

    # bar-chart data: the five busiest users
    top_five = chat_only['User'].value_counts().head()

    # table data: each user's share of the total message count, in percent
    share_df = pd.DataFrame(
        chat_only['User'].value_counts() / chat_only.shape[0] * 100)

    return top_five, share_df
49
+
50
# Word Cloud
def create_wordcloud(selected_user, df):
    """Word-cloud image built from the selected user's messages.

    Returns the generated WordCloud object, ready for ax.imshow().
    """
    # BUG FIX: the app passes 'General' for the whole chat, but this
    # compared against 'Overall', so the whole-chat view filtered down to
    # an empty frame and wc.generate('') raised. Accept both spellings.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]
    # generate the cloud
    wc = WordCloud(width=500, height=500,
                   min_font_size=12, background_color='black')
    # concatenate every message into one text blob for the generator
    df_wc = wc.generate(df['Message'].str.cat(sep=" "))

    return df_wc
62
+
63
+
64
# get the 20 most common words
def get_common_words(selected_user, df):
    """DataFrame of the 20 most frequent non-stopword words.

    Column 0 holds the word, column 1 its count.
    """
    # BUG FIX: assigning to the name 'stopwords' made it a local variable,
    # so 'stopwords.words(...)' raised UnboundLocalError before the nltk
    # corpus could be read. Use a distinct local name.
    spanish_stopwords = set(stopwords.words('spanish'))

    # BUG FIX: the app passes 'General', not 'Overall' — accept both
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    # BUG FIX: the original OR of two '!=' tests is always True, and it
    # compared the media placeholder against 'User'. Drop group
    # notifications AND media-placeholder messages.
    timeline = df[(df['User'] != 'Group Notification') &
                  (df['Message'] != '<Media omitted>')]

    # lowercase tokenisation, stopwords removed
    words = [word
             for message in timeline['Message']
             for word in message.lower().split()
             if word not in spanish_stopwords]

    top_20_w = pd.DataFrame(Counter(words).most_common(20))
    return top_20_w
88
+
89
# get the most used emojis
def get_emoji_stats(selected_user, df):
    """DataFrame of (emoji, count) pairs, most frequent first."""
    # BUG FIX: the app passes 'General', not 'Overall' — accept both
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    # BUG FIX: emoji.UNICODE_EMOJI was removed in emoji >= 2.0; prefer the
    # EMOJI_DATA mapping and fall back to the old attribute on older versions
    if hasattr(emoji, 'EMOJI_DATA'):
        emoji_lookup = emoji.EMOJI_DATA
    else:
        emoji_lookup = emoji.UNICODE_EMOJI['en']

    emojis = [char
              for message in df['Message']
              for char in message
              if char in emoji_lookup]

    counts = Counter(emojis)
    emoji_df = pd.DataFrame(counts.most_common(len(counts)))

    return emoji_df
103
+
104
# user activity per month
def monthly_timeline(selected_user, df):
    """Message count per (year, month) with a plot-ready 'Time' label.

    Returns a DataFrame with columns 'Year', 'Month_num', 'Month',
    'Message' (the count) and 'Time' ('MonthName-Year').
    """
    # BUG FIX: the app passes 'General', not 'Overall' — the old check
    # filtered the whole-chat view down to an empty frame. Accept both.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    timeline = df.groupby(['Year', 'Month_num', 'Month']).count()[
        'Message'].reset_index()

    # x-axis label, vectorized instead of an index loop
    timeline['Time'] = timeline['Month'] + "-" + timeline['Year'].astype(str)

    return timeline
121
+
122
# activity per month
def monthly_activity(selected_user, df):
    """Messages per month name for one user or the whole chat ('General')."""
    # BUG FIX: the app passes 'General', not 'Overall' — the old check
    # filtered the whole-chat view down to an empty frame. Accept both.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    return df['Month'].value_counts()
129
+
130
# activity per week
def weekly_activity(selected_user, df):
    """Messages per weekday name for one user or the whole chat ('General')."""
    # BUG FIX: the app passes 'General', not 'Overall' — the old check
    # filtered the whole-chat view down to an empty frame. Accept both.
    if selected_user not in ('General', 'Overall'):
        df = df[df['User'] == selected_user]

    return df['Day_name'].value_counts()