Spaces (status: Runtime error)
Upload 4 files
- app.py +157 -0
- helper.py +134 -0
- preprocessor.py +111 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,157 @@
import streamlit as st
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('all')
import matplotlib.pyplot as plt
import helper
import preprocessor
from mtranslate import translate
import pandas as pd
import os
from gtts import gTTS
import base64
import torch
import seaborn as sns

st.sidebar.title("WhatsApp Chat Analyzer")

uploaded_file = st.sidebar.file_uploader("Choose a file")

if uploaded_file is not None:

    # read the exported chat as UTF-8 text and build the dataframe
    bytes_data = uploaded_file.getvalue()
    data = bytes_data.decode("utf-8")
    df_new = preprocessor.preprocess(data)

    user_list = df_new['users'].unique().tolist()
    user_list.sort()
    user_list.insert(0, "Group analysis")
    selected_user = st.sidebar.selectbox("Show analysis wrt", user_list)

    if st.sidebar.button("Show Analysis"):
        num_messages, words, num_links = helper.fetch_stats(selected_user, df_new)
        st.title("Top Statistics")
        col1, col2, col3 = st.columns(3)

        with col1:
            st.header("Total Messages")
            st.title(num_messages)
        with col2:
            st.header("Total Words")
            st.title(words)
        with col3:
            st.header("Links Shared")
            st.title(num_links)

        # Monthly timeline
        st.title("Monthly Timeline")
        timeline = helper.monthly_timeline(selected_user, df_new)
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.plot(timeline['time'], timeline['message'])
        plt.xticks(rotation='vertical')
        st.pyplot(fig)

        # Daily timeline
        st.title("Daily Timeline")
        daily_timeline = helper.Daily_timeline(selected_user, df_new)
        fig, ax = plt.subplots()
        ax.plot(daily_timeline['Date'], daily_timeline['message'], color='black')
        plt.xticks(rotation='vertical')
        st.pyplot(fig)

        st.title("Activity Map")
        col1, col2 = st.columns(2)

        with col1:
            st.header("Most busy day")
            busy_day = helper.week_activity_map(selected_user, df_new)
            fig, ax = plt.subplots()
            ax.bar(busy_day.index, busy_day.values)
            plt.xticks(rotation='vertical')
            st.pyplot(fig)
        with col2:
            st.header("Most busy month")
            busy_month = helper.month_activity_map(selected_user, df_new)
            fig, ax = plt.subplots()
            ax.bar(busy_month.index, busy_month.values)
            plt.xticks(rotation='vertical')
            st.pyplot(fig)

        st.title("Weekly Activity Map")
        activity_heatmap = helper.activity_heatmap(selected_user, df_new)
        fig, ax = plt.subplots()
        ax = sns.heatmap(activity_heatmap)
        st.pyplot(fig)

        # group-level view: who sends the most messages
        if selected_user == "Group analysis":
            st.title("Most busy user")
            x, new_df = helper.most_busy_users(df_new)
            fig, ax = plt.subplots()
            col1, col2 = st.columns(2)

            with col1:
                ax.bar(x.index, x.values)
                plt.xticks(rotation='vertical')
                st.pyplot(fig)
            with col2:
                st.dataframe(new_df)

        st.title("Chat Sentiment Analysis")
        col1, col2, col3 = st.columns(3)

        with col1:
            st.header("Positive")
            pos_words = helper.pos_words(selected_user, df_new)
            st.dataframe(pos_words)
        with col2:
            st.header("Negative")
            neg_words = helper.neg_words(selected_user, df_new)
            st.dataframe(neg_words)
        with col3:
            st.header("Neutral")
            neu_words = helper.neu_words(selected_user, df_new)
            st.dataframe(neu_words)

        st.title("Word Cloud")
        df_wc = helper.word_cloud(selected_user, df_new)
        fig, ax = plt.subplots()
        ax.imshow(df_wc)
        plt.axis('off')
        st.pyplot(fig)

        st.title("Most Common Words")
        most_common_df = helper.most_common_words(selected_user, df_new)
        fig, ax = plt.subplots()
        ax.barh(most_common_df[0], most_common_df[1])
        st.pyplot(fig)
        st.dataframe(most_common_df)

        emoji_df = helper.emoji_helper(selected_user, df_new)
        st.title("Emoji Analysis")
        st.dataframe(emoji_df)

    # Free-text sentiment analysis with a Twitter-tuned RoBERTa checkpoint
    st.title("Sentiment Analysis")

    @st.cache(allow_output_mutation=True)  # st.cache_resource is the modern replacement
    def get_model():
        MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL)
        return tokenizer, model

    tokenizer, model = get_model()

    user_input = st.text_area('Enter Text to Analyze')
    button = st.button("Analyze")

    sent_pipeline = pipeline("sentiment-analysis")
    if user_input and button:
        test_sample = tokenizer([user_input], padding=True, truncation=True, max_length=512, return_tensors='pt')
        output = model(**test_sample)  # RoBERTa logits; currently not displayed
        st.write("Prediction: ", sent_pipeline(user_input))

# showWarningOnDirectExecution is a Streamlit config flag; it belongs in
# .streamlit/config.toml, so this plain assignment has no effect here.
showWarningOnDirectExecution = False
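Note that app.py computes `output = model(**test_sample)` from the cardiffnlp checkpoint but displays the result of the default `pipeline("sentiment-analysis")` model instead. A minimal standalone sketch (not part of the upload) of reading the prediction directly from the loaded RoBERTa checkpoint could look like this; the 0/1/2 = negative/neutral/positive label order is taken from that model's card and is an assumption here:

# sentiment_demo.py — standalone sketch, not part of this upload
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Assumed label order per the model card: 0 = negative, 1 = neutral, 2 = positive
LABELS = ["negative", "neutral", "positive"]

def predict(text: str):
    # tokenize, run the classifier, and turn logits into a label + probability
    inputs = tokenizer([text], padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits          # shape: (1, 3)
    probs = torch.softmax(logits, dim=-1)[0]
    idx = int(probs.argmax())
    return LABELS[idx], float(probs[idx])

print(predict("I love this group!"))             # e.g. ('positive', 0.98)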
helper.py
ADDED
@@ -0,0 +1,134 @@
import matplotlib.pyplot as plt
from urlextract import URLExtract
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pandas as pd
import PIL.Image
import numpy as np
import emoji

extract = URLExtract()


def fetch_stats(selected_user, df):
    # message count, total words and shared links for one user or the whole group
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]
    num_messages = df.shape[0]

    words = []
    for message in df['message']:
        words.extend(message.split())

    links = []
    for message in df['message']:
        links.extend(extract.find_urls(message))

    return num_messages, len(words), len(links)


def most_busy_users(df):
    x = df['users'].value_counts().head()
    df = round((df['users'].value_counts() / df.shape[0]) * 100, 2).reset_index().rename(
        columns={'index': 'name', 'users': 'percent'})
    return x, df


def most_common_words(selected_user, df):
    # stop_hinglish.txt is expected next to the app; it is not part of this upload
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = f.read().split()

    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]
    temp = df[df['users'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    most_common_df = pd.DataFrame(Counter(words).most_common(30))
    return most_common_df


def word_cloud(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    stopwords = set(STOPWORDS)

    # build the word cloud; rendering is handled by the caller (app.py)
    wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(' '.join(df['message']))
    return wordcloud


def emoji_helper(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    emojis = []
    for message in df['message']:
        emojis.extend([c for c in message if c in emoji.EMOJI_DATA.keys()])
    emoji_df = pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))

    return emoji_df


def monthly_timeline(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    timeline = df.groupby(['year', 'Month_name', 'Month']).count()['message'].reset_index()
    time = []
    for i in range(timeline.shape[0]):
        time.append(timeline['Month_name'][i] + "-" + str(timeline['year'][i]))
    timeline['time'] = time

    return timeline


def Daily_timeline(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    daily_timeline = df.groupby('Date').count()['message'].reset_index()
    return daily_timeline


def week_activity_map(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]
    return df['Day_name'].value_counts()


def month_activity_map(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]
    return df['Month_name'].value_counts()


def activity_heatmap(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    activity_heatmap = df.pivot_table(index='Day_name', columns='period', values='message', aggfunc='count').fillna(0)
    return activity_heatmap


def pos_words(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    pos_word = df[df['vader_Analysis'] == 'Positive']
    pos_word = pos_word.pop('message')
    return pos_word


def neg_words(selected_user, df):
    # uses the VADER label column, consistent with pos_words/neu_words
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    neg_word = df[df['vader_Analysis'] == 'Negative']
    neg_word = neg_word.pop('message')
    return neg_word


def neu_words(selected_user, df):
    if selected_user != "Group analysis":
        df = df[df['users'] == selected_user]

    neu_word = df[df['vader_Analysis'] == 'Neutral']
    neu_word = neu_word.pop('message')
    return neu_word
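The helpers only rely on a few column names produced by preprocessor.preprocess. A minimal sketch with hypothetical data, showing the 'users' and 'message' columns that fetch_stats and emoji_helper expect:

# demo_helper.py — sketch with made-up data, not part of the upload
import pandas as pd
import helper

df = pd.DataFrame({
    'users':   ['Alice', 'Bob', 'Alice'],
    'message': ['Hello there 👋', 'Check https://example.com', 'See you soon'],
})

num_messages, num_words, num_links = helper.fetch_stats("Group analysis", df)
print(num_messages, num_words, num_links)   # 3 messages, 8 words, 1 link

print(helper.emoji_helper("Alice", df))     # one row per emoji with its count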
preprocessor.py
ADDED
@@ -0,0 +1,111 @@
import pandas as pd
import re
from textblob import TextBlob
import numpy as np
import nltk
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm  # plain tqdm; the notebook variant is not needed in a Streamlit app

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def preprocess(data):
    # WhatsApp export lines look like "12/31/21, 10:15 - Sender: message"
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)
    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %H:%M - ')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    users = []
    messages = []
    for message in df['user_message']:
        entry = re.split(r'([\w\W]+?):\s', message)
        if entry[1:]:
            # normal message: "sender: text"
            users.append(entry[1])
            messages.append(entry[2])
        else:
            # system notifications have no sender
            users.append('group_notification')
            messages.append(entry[0])
    df['users'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['year'] = df['date'].dt.year
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    df['Day_name'] = df['date'].dt.day_name()
    df['Date'] = df['date'].dt.date
    df['Month'] = df['date'].dt.month
    df['Month_name'] = df['date'].dt.month_name()

    # hourly period labels for the activity heatmap, e.g. "22-23"
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + "00")
        elif hour == 0:
            period.append("00" + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))

    df['period'] = period

    temp = df[df['users'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n'].copy()
    temp.replace("", np.nan, inplace=True)
    temp = temp.dropna()

    def cleanTxt(text):
        text = re.sub(r'@[A-Za-z0-9]+', '', text)   # drop @mentions
        text = re.sub(r'#', '', text)                # drop the # symbol
        text = text.replace('\n', "")
        return text

    temp['message'] = temp['message'].apply(cleanTxt)
    temp['users'] = temp['users'].apply(cleanTxt)

    # VADER scores keyed by user (later messages overwrite earlier ones per user)
    res = {}
    for i, row in tqdm(temp.iterrows(), total=len(temp)):
        text = row['message']
        myid = row['users']
        res[myid] = sia.polarity_scores(text)

    vaders = pd.DataFrame(res).T
    vaders = vaders.reset_index().rename(columns={'index': 'users'})
    vaders = vaders.merge(temp, how="right")
    vaders_new = vaders.pop('message')
    vaders_new = pd.DataFrame(vaders_new)
    vaders.insert(1, "message", vaders_new['message'])

    def getSubjectivity(text):
        return TextBlob(text).sentiment.subjectivity

    def getPolarity(text):
        return TextBlob(text).sentiment.polarity

    vaders['Subjectivity'] = vaders['message'].apply(getSubjectivity)
    vaders['Polarity'] = vaders['message'].apply(getPolarity)

    # label from the TextBlob polarity score
    def getAnalysis(score):
        if score < 0:
            return 'Negative'
        if score == 0:
            return 'Neutral'
        else:
            return 'Positive'

    vaders['Analysis'] = vaders['Polarity'].apply(getAnalysis)

    # label from the VADER compound score (separate name to avoid shadowing)
    def getVaderAnalysis(score):
        if score <= 0:
            return 'Negative'
        if score < 0.2960:
            return 'Neutral'
        else:
            return 'Positive'

    vaders['vader_Analysis'] = vaders['compound'].apply(getVaderAnalysis)

    return vaders
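The regex and the '%m/%d/%y, %H:%M - ' format assume a US-style, 24-hour export without AM/PM; exports from other locales (dd/mm/yyyy, "am/pm" markers) would need the pattern and format string adjusted. A minimal sketch with a hypothetical two-message export:

# demo_preprocess.py — sketch with made-up chat text, not part of the upload
import preprocessor

sample = (
    "12/31/21, 10:15 - Alice: Happy new year everyone!\n"
    "12/31/21, 10:17 - Bob: Same to you 🎉\n"
)

df = preprocessor.preprocess(sample)
print(df[['date', 'users', 'message', 'Day_name', 'period']])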
requirements.txt
ADDED
Binary file (7.22 kB); contents not rendered in this view.
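Since requirements.txt is not shown inline, the following is only an inferred sketch based on the imports in the three Python files above, not the actual file (versions unpinned):

streamlit
transformers
torch
nltk
pandas
numpy
matplotlib
seaborn
wordcloud
urlextract
emoji
textblob
tqdm
mtranslate
gTTS
Pillow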