Mohit-321 committed on
Commit
e5c2ee1
1 Parent(s): e57fc6b

Upload 4 files

Files changed (4)
  1. app.py +157 -0
  2. helper.py +134 -0
  3. preprocessor.py +111 -0
  4. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,157 @@
+ import streamlit as st
+ from transformers import pipeline
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSequenceClassification
+ import warnings
+ warnings.filterwarnings("ignore")
+ import nltk
+ nltk.download('all')
+ import matplotlib.pyplot as plt
+ import helper
+ import preprocessor
+ from mtranslate import translate
+ import pandas as pd
+ import os
+ from gtts import gTTS
+ import base64
+ import torch
+ import seaborn as sns
+
+ st.sidebar.title("WhatsApp Chat Analyzer")
+
+ uploaded_file = st.sidebar.file_uploader("Choose a file")
+
+ if uploaded_file is not None:
+
+     # Decode the uploaded export and build the preprocessed dataframe
+     bytes_data = uploaded_file.getvalue()
+     data = bytes_data.decode("utf-8")
+     df_new = preprocessor.preprocess(data)
+
+     user_list = df_new['users'].unique().tolist()
+     user_list.sort()
+     user_list.insert(0, "Group analysis")
+     selected_user = st.sidebar.selectbox("Show analysis wrt", user_list)
+
+     if st.sidebar.button("Show Analysis"):
+         num_messages, words, num_links = helper.fetch_stats(selected_user, df_new)
+         st.title("Top Statistics")
+         col1, col2, col3 = st.columns(3)
+
+         with col1:
+             st.header("Total Messages")
+             st.title(num_messages)
+         with col2:
+             st.header("Total Words")
+             st.title(words)
+         with col3:
+             st.header("Links Shared")
+             st.title(num_links)
+
+         # Monthly timeline
+         st.title("Monthly Timeline")
+         timeline = helper.monthly_timeline(selected_user, df_new)
+         fig, ax = plt.subplots(figsize=(10, 8))
+         ax.plot(timeline['time'], timeline['message'])
+         plt.xticks(rotation='vertical')
+         st.pyplot(fig)
+
+         # Daily timeline
+         st.title("Daily Timeline")
+         daily_timeline = helper.Daily_timeline(selected_user, df_new)
+         fig, ax = plt.subplots()
+         ax.plot(daily_timeline['Date'], daily_timeline['message'], color='black')
+         plt.xticks(rotation='vertical')
+         st.pyplot(fig)
+
+         # Activity maps
+         st.title("Activity Map")
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.header("Most busy day")
+             busy_day = helper.week_activity_map(selected_user, df_new)
+             fig, ax = plt.subplots()
+             ax.bar(busy_day.index, busy_day.values)
+             plt.xticks(rotation='vertical')
+             st.pyplot(fig)
+         with col2:
+             st.header("Most busy month")
+             busy_month = helper.month_activity_map(selected_user, df_new)
+             fig, ax = plt.subplots()
+             ax.bar(busy_month.index, busy_month.values)
+             plt.xticks(rotation='vertical')
+             st.pyplot(fig)
+
+         st.title("Weekly Activity Map")
+         Activity_heatmap = helper.activity_heatmap(selected_user, df_new)
+         fig, ax = plt.subplots()
+         ax = sns.heatmap(Activity_heatmap)
+         st.pyplot(fig)
+
+         # Per-user contribution is only meaningful at group level
+         if selected_user == "Group analysis":
+             st.title("Most busy user")
+             x, new_df = helper.most_busy_users(df_new)
+             fig, ax = plt.subplots()
+             col1, col2 = st.columns(2)
+
+             with col1:
+                 ax.bar(x.index, x.values)
+                 plt.xticks(rotation='vertical')
+                 st.pyplot(fig)
+             with col2:
+                 st.dataframe(new_df)
+
+         st.title("Chat Sentiment Analysis")
+         col1, col2, col3 = st.columns(3)
+
+         with col1:
+             st.header("Positive")
+             pos_words = helper.pos_words(selected_user, df_new)
+             st.dataframe(pos_words)
+         with col2:
+             st.header("Negative")
+             neg_words = helper.neg_words(selected_user, df_new)
+             st.dataframe(neg_words)
+         with col3:
+             st.header("Neutral")
+             neu_words = helper.neu_words(selected_user, df_new)
+             st.dataframe(neu_words)
+
+         st.title("Word Cloud")
+         df_wc = helper.word_cloud(selected_user, df_new)
+         fig, ax = plt.subplots()
+         ax.imshow(df_wc)
+         plt.axis('off')
+         st.pyplot(fig)
+
+         st.title("Most Common Words")
+         most_common_df = helper.most_common_words(selected_user, df_new)
+         fig, ax = plt.subplots()
+         ax.barh(most_common_df[0], most_common_df[1])
+         st.pyplot(fig)
+         st.dataframe(most_common_df)
+
+         st.title("Emoji Analysis")
+         emoji_df = helper.emoji_helper(selected_user, df_new)
+         st.dataframe(emoji_df)
+
+ # Transformer-based sentiment analysis on free text entered by the user
+ st.title("Sentiment Analysis")
+
+ @st.cache(allow_output_mutation=True)
+ def get_model():
+     MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
+     tokenizer = AutoTokenizer.from_pretrained(MODEL)
+     model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+     return tokenizer, model
+
+ tokenizer, model = get_model()
+
+ user_input = st.text_area('Enter Text to Analyze')
+ button = st.button("Analyze")
+
+ sent_pipeline = pipeline("sentiment-analysis")
+ if user_input and button:
+     test_sample = tokenizer([user_input], padding=True, truncation=True, max_length=512, return_tensors='pt')
+     output = model(**test_sample)
+     st.write("Prediction: ", sent_pipeline(user_input))
+
+ # Leftover Streamlit config flag; has no effect as a plain assignment here
+ showWarningOnDirectExecution = False
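
The app itself is started with the usual Streamlit entry point, streamlit run app.py. As a rough illustration, the same modules can also be exercised from a plain Python shell, assuming a WhatsApp text export saved under a hypothetical name such as chat_export.txt:

# Hypothetical quick check of the helpers outside Streamlit (file name is an example)
import preprocessor
import helper

with open("chat_export.txt", encoding="utf-8") as f:
    data = f.read()

df = preprocessor.preprocess(data)
num_messages, words, num_links = helper.fetch_stats("Group analysis", df)
print(num_messages, "messages,", words, "words,", num_links, "links")
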
helper.py ADDED
@@ -0,0 +1,134 @@
+ import matplotlib.pyplot as plt
+ from urlextract import URLExtract
+ from collections import Counter
+ from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+ import pandas as pd
+ import PIL.Image
+ import numpy as np
+ import emoji
+
+ extract = URLExtract()
+
+ def fetch_stats(selected_user, df):
+     # Total messages, words and shared links for one user or the whole group
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+     num_messages = df.shape[0]
+     words = []
+     for message in df['message']:
+         words.extend(message.split())
+
+     links = []
+     for message in df['message']:
+         links.extend(extract.find_urls(message))
+
+     return num_messages, len(words), len(links)
+
+ def most_busy_users(df):
+     x = df['users'].value_counts().head()
+     df = round((df['users'].value_counts() / df.shape[0]) * 100, 2).reset_index().rename(
+         columns={'index': 'name', 'users': 'percent'})
+     return x, df
+
+ def most_common_words(selected_user, df):
+     with open('stop_hinglish.txt', 'r') as f:
+         stop_words = f.read().split()
+
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+     temp = df[df['users'] != 'group_notification']
+     temp = temp[temp['message'] != '<Media omitted>\n']
+
+     words = []
+     for message in temp['message']:
+         for word in message.lower().split():
+             if word not in stop_words:
+                 words.append(word)
+     most_common_df = pd.DataFrame(Counter(words).most_common(30))
+     return most_common_df
+
+ def word_cloud(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     stopwords = set(STOPWORDS)
+
+     # word cloud over all messages of the selection
+     wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(' '.join(df['message']))
+     plt.figure(figsize=(10, 8), facecolor='k')
+     plt.imshow(wordcloud, interpolation='bilinear')
+
+     return wordcloud
+
+ def emoji_helper(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+     emojis = []
+     for message in df['message']:
+         emojis.extend([c for c in message if c in emoji.EMOJI_DATA])
+     emoji_df = pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))
+
+     return emoji_df
+
+ def monthly_timeline(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     timeline = df.groupby(['year', 'Month_name', 'Month']).count()['message'].reset_index()
+     time = []
+     for i in range(timeline.shape[0]):
+         time.append(timeline['Month_name'][i] + "-" + str(timeline['year'][i]))
+     timeline['time'] = time
+
+     return timeline
+
+ def Daily_timeline(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     daily_timeline = df.groupby('Date').count()['message'].reset_index()
+     return daily_timeline
+
+ def week_activity_map(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+     return df['Day_name'].value_counts()
+
+ def month_activity_map(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+     return df['Month_name'].value_counts()
+
+ def activity_heatmap(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     # day-of-week x hour-bucket message counts
+     Activity_heatmap = df.pivot_table(index='Day_name', columns='period', values='message', aggfunc='count').fillna(0)
+     return Activity_heatmap
+
+ def pos_words(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     pos_word = df[df['vader_Analysis'] == 'Positive']
+     pos_word = pos_word.pop('message')
+     return pos_word
+
+ def neg_words(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     # note: filters on the TextBlob-based 'Analysis' column, not 'vader_Analysis'
+     neg_word = df[df['Analysis'] == 'Negative']
+     neg_word = neg_word.pop('message')
+     return neg_word
+
+ def neu_words(selected_user, df):
+     if selected_user != "Group analysis":
+         df = df[df['users'] == selected_user]
+
+     neu_word = df[df['vader_Analysis'] == 'Neutral']
+     neu_word = neu_word.pop('message')
+     return neu_word
preprocessor.py ADDED
@@ -0,0 +1,111 @@
+ import pandas as pd
+ import re
+ from textblob import TextBlob
+ import numpy as np
+ import nltk
+ import nltk.data
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
+ from tqdm import tqdm
+
+ nltk.download('vader_lexicon')
+ sia = SentimentIntensityAnalyzer()
+
+ def preprocess(data):
+     # WhatsApp export lines look like "1/5/23, 10:30 - Name: message"
+     pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
+
+     messages = re.split(pattern, data)[1:]
+     dates = re.findall(pattern, data)
+     df = pd.DataFrame({'user_message': messages, 'message_date': dates})
+     df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %H:%M - ')
+     df.rename(columns={'message_date': 'date'}, inplace=True)
+
+     # Separate the sender from the message body
+     users = []
+     messages = []
+     for message in df['user_message']:
+         entry = re.split(r'([\w\W]+?):\s', message)
+         if entry[1:]:
+             users.append(entry[1])
+             messages.append(entry[2])
+         else:
+             users.append('group_notification')
+             messages.append(entry[0])
+     df['users'] = users
+     df['message'] = messages
+     df.drop(columns=['user_message'], inplace=True)
+
+     df['year'] = df['date'].dt.year
+     df['day'] = df['date'].dt.day
+     df['hour'] = df['date'].dt.hour
+     df['minute'] = df['date'].dt.minute
+     df['Day_name'] = df['date'].dt.day_name()
+     df['Date'] = df['date'].dt.date
+     df['Month'] = df['date'].dt.month
+     df['Month_name'] = df['date'].dt.month_name()
+
+     # Hourly buckets used by the activity heatmap
+     period = []
+     for hour in df[['Day_name', 'hour']]['hour']:
+         if hour == 23:
+             period.append(str(hour) + "-" + str('00'))
+         elif hour == 0:
+             period.append(str('00') + "-" + str(hour + 1))
+         else:
+             period.append(str(hour) + "-" + str(hour + 1))
+
+     df['period'] = period
+
+     # Drop system notifications and media placeholders before sentiment scoring
+     temp = df[df['users'] != 'group_notification']
+     temp = temp[temp['message'] != '<Media omitted>\n']
+     temp.replace("", np.nan, inplace=True)
+     temp = temp.dropna()
+
+     def cleanTxt(text):
+         text = re.sub(r'@[A-Za-z0-9]+', '', text)
+         text = re.sub(r'#', '', text)
+         text = text.replace('\n', "")
+         return text
+
+     temp['message'] = temp['message'].apply(cleanTxt)
+     temp['users'] = temp['users'].apply(cleanTxt)
+
+     # VADER polarity scores, keyed by user (one entry per user)
+     res = {}
+     for i, row in tqdm(temp.iterrows(), total=len(temp)):
+         text = row['message']
+         myid = row['users']
+         res[myid] = sia.polarity_scores(text)
+
+     vaders = pd.DataFrame(res).T
+     vaders = vaders.reset_index().rename(columns={'index': 'users'})
+     vaders = vaders.merge(temp, how="right")
+     vaders_new = vaders.pop('message')
+     vaders_new = pd.DataFrame(vaders_new)
+     vaders.insert(1, "message", vaders_new['message'])
+
+     def getSubjectivity(text):
+         return TextBlob(text).sentiment.subjectivity
+
+     def getPolarity(text):
+         return TextBlob(text).sentiment.polarity
+
+     vaders['Subjectivity'] = vaders['message'].apply(getSubjectivity)
+     vaders['Polarity'] = vaders['message'].apply(getPolarity)
+
+     # TextBlob polarity -> 'Analysis' label
+     def getAnalysis(score):
+         if score < 0:
+             return 'Negative'
+         if score == 0:
+             return 'Neutral'
+         else:
+             return 'Positive'
+
+     vaders['Analysis'] = vaders['Polarity'].apply(getAnalysis)
+
+     # VADER compound score -> 'vader_Analysis' label (redefines getAnalysis with different thresholds)
+     def getAnalysis(score):
+         if score <= 0:
+             return 'Negative'
+         if score < 0.2960:
+             return 'Neutral'
+         else:
+             return 'Positive'
+
+     vaders['vader_Analysis'] = vaders['compound'].apply(getAnalysis)
+
+     return vaders
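
preprocess() expects chat lines in the export format matched by the regex and the '%m/%d/%y, %H:%M - ' date format above. A minimal sketch with synthetic input (names and messages are made up for illustration only):

# Synthetic two-line chat in the expected export format (illustration only)
import preprocessor

sample = ("1/5/23, 10:30 - Alice: Good morning everyone\n"
          "1/5/23, 10:32 - Bob: Morning! Sharing https://example.com\n")
vaders = preprocessor.preprocess(sample)
print(vaders[['users', 'message', 'compound', 'Analysis', 'vader_Analysis']])
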
requirements.txt ADDED
Binary file (7.22 kB).