darthPanda committed on
Commit fccd4a8
1 Parent(s): cd80221
app.py ADDED
@@ -0,0 +1,113 @@
+ import streamlit as st
+
+ # st.set_page_config must be the first Streamlit call, so it runs
+ # before the remaining imports.
+ st.set_page_config(
+     page_title="Social Media Sentiment Analyzer", page_icon="📊", layout="wide"
+ )
+
+ import helper_functions as hf
+
+ # Whenever the Search button is clicked, this callback scrapes the requested
+ # tweets, scores their sentiment and stores the result in st.session_state.df.
+ def search_callback():
+     if twitter_agree:
+         if len(st.session_state.search_term) == 0:
+             st.error("Please enter a search term")
+             return
+         st.session_state.df = hf.get_tweets(
+             st.session_state.search_term, st.session_state.num_tweets
+         )
+         st.session_state.df = hf.get_sentiment(st.session_state.df)
+
+ def twitter_form():
+     with st.form(key="search_form"):
+         st.subheader("Search Parameters")
+         st.text_input(
+             "Enter a User handle (like _@elonmusk_), Hashtag (like _#Bitcoin_) or Topic (like _climate change_)",
+             key="search_term",
+         )
+         st.slider("Number of tweets", min_value=100, max_value=500, key="num_tweets")
+         st.form_submit_button(label="Search", on_click=search_callback)
+         st.markdown(
+             "Note: it may take a while to load the results, especially with a large number of tweets"
+         )
+
+ with st.sidebar:
+     st.title("Social Media Sentiment Analyzer")
+     st.subheader("Choose your platform")
+     twitter_agree = st.checkbox("Twitter")
+
+     if twitter_agree:
+         twitter_form()
+
+     st.markdown(
+         "<div style='position: fixed; bottom: 0;'>Created by Taaha Bajwa</div>",
+         unsafe_allow_html=True,
+     )
+
+ if "df" in st.session_state:
+
+     def make_dashboard(tweet_df, bar_color, wc_color):
+         # first row: sentiment pie chart, top unigrams, top bigrams
+         col1, col2, col3 = st.columns([28, 34, 38])
+         with col1:
+             sentiment_plot = hf.plot_sentiment(tweet_df)
+             sentiment_plot.update_layout(height=350, title_x=0.5)
+             st.plotly_chart(sentiment_plot, theme=None, use_container_width=True)
+         with col2:
+             top_unigram = hf.get_top_n_gram(tweet_df, ngram_range=(1, 1), n=10)
+             unigram_plot = hf.plot_n_gram(
+                 top_unigram, title="Top 10 Occurring Words", color=bar_color
+             )
+             unigram_plot.update_layout(height=350)
+             st.plotly_chart(unigram_plot, theme=None, use_container_width=True)
+         with col3:
+             top_bigram = hf.get_top_n_gram(tweet_df, ngram_range=(2, 2), n=10)
+             bigram_plot = hf.plot_n_gram(
+                 top_bigram, title="Top 10 Occurring Bigrams", color=bar_color
+             )
+             bigram_plot.update_layout(height=350)
+             st.plotly_chart(bigram_plot, theme=None, use_container_width=True)
+
+         # second row: colour-coded tweet table and word cloud
+         col1, col2 = st.columns([60, 40])
+         with col1:
+
+             def sentiment_color(sentiment):
+                 if sentiment == "Positive":
+                     return "background-color: #54A24B; color: white"
+                 elif sentiment == "Negative":
+                     return "background-color: #FF7F0E"
+                 else:
+                     return "background-color: #1F77B4"
+
+             st.dataframe(
+                 tweet_df[["Sentiment", "Tweet"]].style.applymap(
+                     sentiment_color, subset=["Sentiment"]
+                 ),
+                 height=350,
+             )
+         with col2:
+             wordcloud = hf.plot_wordcloud(tweet_df, colormap=wc_color)
+             st.pyplot(wordcloud)
+
+     # enlarge the tab labels
+     adjust_tab_font = """
+     <style>
+     button[data-baseweb="tab"] > div[data-testid="stMarkdownContainer"] > p {
+         font-size: 20px;
+     }
+     </style>
+     """
+     st.write(adjust_tab_font, unsafe_allow_html=True)
+
+     tab1, tab2, tab3, tab4 = st.tabs(["All", "Positive 😊", "Negative ☹️", "Neutral 😐"])
+     with tab1:
+         tweet_df = st.session_state.df
+         make_dashboard(tweet_df, bar_color="#1F77B4", wc_color="Blues")
+     with tab2:
+         tweet_df = st.session_state.df.query("Sentiment == 'Positive'")
+         make_dashboard(tweet_df, bar_color="#54A24B", wc_color="Greens")
+     with tab3:
+         tweet_df = st.session_state.df.query("Sentiment == 'Negative'")
+         make_dashboard(tweet_df, bar_color="#FF7F0E", wc_color="Oranges")
+     with tab4:
+         tweet_df = st.session_state.df.query("Sentiment == 'Neutral'")
+         make_dashboard(tweet_df, bar_color="#1F77B4", wc_color="Blues")
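
The dashboard hinges on one Streamlit pattern: the form's on_click callback writes the scored dataframe into st.session_state, which survives reruns, so the 'if "df" in st.session_state' block keeps rendering on every later interaction. A minimal runnable sketch of just that pattern, with a hypothetical fetch_data() standing in for the scraping and sentiment steps:

import streamlit as st
import pandas as pd

def fetch_data():
    # hypothetical stand-in for hf.get_tweets + hf.get_sentiment
    return pd.DataFrame({"Tweet": ["hello world"], "Sentiment": ["Neutral"]})

def search_callback():
    # callbacks run before the script reruns, so the block below sees the data
    st.session_state.df = fetch_data()

with st.form(key="search_form"):
    st.text_input("Search term", key="search_term")
    st.form_submit_button("Search", on_click=search_callback)

if "df" in st.session_state:  # persists across reruns
    st.dataframe(st.session_state.df)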
helper_functions.py ADDED
@@ -0,0 +1,255 @@
+ import re
+
+ import pandas as pd
+ import numpy as np
+ import snscrape.modules.twitter as sntwitter
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from transformers import pipeline
+ import plotly.express as px
+ import plotly.io as pio
+ import matplotlib as mpl
+ import matplotlib.pyplot as plt
+ from wordcloud import WordCloud
+ from PIL import Image
+
+ # Download the NLTK data needed for tokenization, POS tagging and
+ # lemmatization once per session.
+ @st.cache(allow_output_mutation=True)
+ def get_nltk():
+     import nltk
+     nltk.download(
+         ["punkt", "wordnet", "omw-1.4", "averaged_perceptron_tagger", "universal_tagset"]
+     )
+
+ get_nltk()
+
+ from nltk.stem import WordNetLemmatizer
+ from nltk.tag import pos_tag
+ from nltk.tokenize import word_tokenize
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ # Create a custom Plotly theme and set it as the default
+ pio.templates["custom"] = pio.templates["plotly_white"]
+ pio.templates["custom"].layout.margin = {"b": 25, "l": 25, "r": 25, "t": 50}
+ pio.templates["custom"].layout.width = 600
+ pio.templates["custom"].layout.height = 450
+ pio.templates["custom"].layout.autosize = False
+ pio.templates["custom"].layout.font.update(
+     {"family": "Arial", "size": 12, "color": "#707070"}
+ )
+ pio.templates["custom"].layout.title.update(
+     {
+         "xref": "container",
+         "yref": "container",
+         "x": 0.5,
+         "yanchor": "top",
+         "font_size": 16,
+         "y": 0.95,
+         "font_color": "#353535",
+     }
+ )
+ pio.templates["custom"].layout.xaxis.update(
+     {"showline": True, "linecolor": "lightgray", "title_font_size": 14}
+ )
+ pio.templates["custom"].layout.yaxis.update(
+     {"showline": True, "linecolor": "lightgray", "title_font_size": 14}
+ )
+ pio.templates["custom"].layout.colorway = [
+     "#1F77B4", "#FF7F0E", "#54A24B", "#D62728", "#C355FA",
+     "#8C564B", "#E377C2", "#7F7F7F", "#FFE323", "#17BECF",
+ ]
+ pio.templates.default = "custom"
+
+ # Cache the FinBERT tokenizer and model so they are loaded only once.
+ @st.cache(allow_output_mutation=True)
+ def get_sentiment_model():
+     tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
+     model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
+     return tokenizer, model
+
+ tokenizer_sentiment, model_sentiment = get_sentiment_model()
+
+ def get_tweets(query, max_tweets):
+     # A leading "@" means a user handle: scrape that account's own tweets.
+     # Anything else is treated as a keyword / hashtag search.
+     if query.startswith("@"):
+         scraper = sntwitter.TwitterSearchScraper("from:" + query[1:])
+     else:
+         scraper = sntwitter.TwitterSearchScraper(query)
+
+     tweets_list = []
+     for i, tweet in enumerate(scraper.get_items()):
+         if i >= max_tweets:
+             break
+         tweets_list.append([tweet.date, tweet.user.username, tweet.content])
+
+     tweets_df = pd.DataFrame(tweets_list, columns=["Datetime", "Username", "Tweet"])
+     tweets_df["Datetime"] = pd.to_datetime(tweets_df["Datetime"])
+     tweets_df["Date"] = tweets_df["Datetime"].dt.date
+     tweets_df["Time"] = tweets_df["Datetime"].dt.strftime("%H:%M")
+     tweets_df.drop("Datetime", axis=1, inplace=True)
+     return tweets_df
+
+ def text_preprocessing(text):
+     stopwords = set()
+     with open("static/en_stopwords.txt", "r") as file:
+         for word in file:
+             stopwords.add(word.rstrip("\n"))
+     lemmatizer = WordNetLemmatizer()
+     try:
+         url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
+         user_pattern = r"@[^\s]+"
+         entity_pattern = r"&.*;"
+         neg_contraction = r"n't\W"
+         non_alpha = "[^a-z]"
+         cleaned_text = text.lower()
+         cleaned_text = re.sub(neg_contraction, " not ", cleaned_text)
+         cleaned_text = re.sub(url_pattern, " ", cleaned_text)
+         cleaned_text = re.sub(user_pattern, " ", cleaned_text)
+         cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
+         cleaned_text = re.sub(non_alpha, " ", cleaned_text)
+         tokens = word_tokenize(cleaned_text)
+         # provide a POS tag for lemmatization to yield better results
+         word_tag_tuples = pos_tag(tokens, tagset="universal")
+         tag_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
+         final_tokens = []
+         for word, tag in word_tag_tuples:
+             if len(word) > 1 and word not in stopwords:
+                 if tag in tag_dict:
+                     final_tokens.append(lemmatizer.lemmatize(word, tag_dict[tag]))
+                 else:
+                     final_tokens.append(lemmatizer.lemmatize(word))
+         return " ".join(final_tokens)
+     except Exception:
+         return np.nan
+
+ def get_sentiment(df):
+     # Score every tweet with the cached FinBERT model and append the
+     # label/score columns to the dataframe.
+     classifier = pipeline(
+         "text-classification", model=model_sentiment, tokenizer=tokenizer_sentiment
+     )
+     output = classifier(df["Tweet"].tolist(), truncation=True)
+
+     df_temp = pd.DataFrame.from_dict(output)
+     df = pd.concat([df, df_temp], axis=1)
+     df = df.rename(columns={"label": "Sentiment"})
+     df["Sentiment"] = df["Sentiment"].replace(
+         {"positive": "Positive", "negative": "Negative", "neutral": "Neutral"}
+     )
+     return df
+
+ def plot_sentiment(tweet_df):
+     sentiment_count = tweet_df["Sentiment"].value_counts()
+     fig = px.pie(
+         values=sentiment_count.values,
+         names=sentiment_count.index,
+         hole=0.3,
+         title="<b>Sentiment Distribution</b>",
+         color=sentiment_count.index,
+         color_discrete_map={
+             "Positive": "#54A24B",
+             "Negative": "#FF7F0E",
+             "Neutral": "#1F77B4",
+         },
+     )
+     fig.update_traces(
+         textposition="inside",
+         texttemplate="%{label}<br>%{value} (%{percent})",
+         hovertemplate="<b>%{label}</b><br>Percentage=%{percent}<br>Count=%{value}",
+     )
+     fig.update_layout(showlegend=False)
+     return fig
+
+ def get_top_n_gram(tweet_df, ngram_range, n=10):
+     stopwords = set()
+     with open("static/en_stopwords_ngram.txt", "r") as file:
+         for word in file:
+             stopwords.add(word.rstrip("\n"))
+     corpus = tweet_df["Tweet"]
+     vectorizer = CountVectorizer(
+         analyzer="word", ngram_range=ngram_range, stop_words=list(stopwords)
+     )
+     X = vectorizer.fit_transform(corpus.astype(str).values)
+     words = vectorizer.get_feature_names_out()
+     words_count = np.ravel(X.sum(axis=0))
+     df = pd.DataFrame(zip(words, words_count), columns=["words", "counts"])
+     df = df.sort_values(by="counts", ascending=False).head(n)
+     df["words"] = df["words"].str.title()
+     return df
+
+ def plot_n_gram(n_gram_df, title, color="#54A24B"):
+     fig = px.bar(
+         n_gram_df,
+         x="counts",
+         y="words",
+         title="<b>{}</b>".format(title),
+         text_auto=True,
+     )
+     fig.update_layout(plot_bgcolor="white")
+     fig.update_xaxes(title=None)
+     fig.update_yaxes(autorange="reversed", title=None)
+     fig.update_traces(hovertemplate="<b>%{y}</b><br>Count=%{x}", marker_color=color)
+     return fig
+
+ def plot_wordcloud(tweet_df, colormap="Greens"):
+     stopwords = set()
+     with open("static/en_stopwords_ngram.txt", "r") as file:
+         for word in file:
+             stopwords.add(word.rstrip("\n"))
+     # use a narrow band of the colormap so the cloud stays readable
+     cmap = mpl.cm.get_cmap(colormap)(np.linspace(0, 1, 20))
+     cmap = mpl.colors.ListedColormap(cmap[10:15])
+     mask = np.array(Image.open("static/twitter_mask.png"))
+     font = "static/quartzo.ttf"
+     # clean the tweets without mutating the (possibly filtered) input frame;
+     # drop the NaNs that text_preprocessing returns on failure before joining
+     cleaned_tweets = tweet_df["Tweet"].apply(text_preprocessing).dropna()
+     text = " ".join(cleaned_tweets)
+     wc = WordCloud(
+         background_color="white",
+         font_path=font,
+         stopwords=stopwords,
+         max_words=90,
+         colormap=cmap,
+         mask=mask,
+         random_state=42,
+         collocations=False,
+         min_word_length=2,
+         max_font_size=200,
+     )
+     wc.generate(text)
+     fig = plt.figure(figsize=(8, 8))
+     plt.imshow(wc, interpolation="bilinear")
+     plt.axis("off")
+     plt.title("Wordcloud", fontdict={"fontsize": 16}, fontweight="heavy", pad=20, y=1.0)
+     return fig
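
The sentiment step is a standard transformers text-classification pipeline around ProsusAI/finbert. A minimal sketch of the same call outside Streamlit, assuming transformers is installed (the model downloads on first use); the example texts and printed output are illustrative only:

from transformers import pipeline

# FinBERT assigns each text a positive / negative / neutral label with a score
classifier = pipeline("text-classification", model="ProsusAI/finbert")
results = classifier([
    "Stocks rallied after the earnings beat.",
    "The company filed for bankruptcy.",
])
print(results)
# e.g. [{'label': 'positive', 'score': ...}, {'label': 'negative', 'score': ...}]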
static/en_stopwords.txt ADDED
@@ -0,0 +1,102 @@
+ if
+ his
+ our
+ they
+ can
+ into
+ an
+ same
+ himself
+ themselves
+ her
+ are
+ such
+ through
+ each
+ when
+ just
+ yourselves
+ hers
+ that
+ with
+ those
+ it
+ was
+ we
+ its
+ me
+ myself
+ ve
+ and
+ itself
+ does
+ doing
+ or
+ being
+ did
+ there
+ while
+ you
+ between
+ about
+ on
+ then
+ my
+ ourselves
+ by
+ too
+ at
+ ours
+ here
+ had
+ been
+ as
+ the
+ has
+ off
+ these
+ other
+ your
+ him
+ herself
+ now
+ is
+ theirs
+ whom
+ any
+ to
+ for
+ from
+ of
+ were
+ have
+ he
+ ll
+ be
+ but
+ until
+ yours
+ this
+ again
+ re
+ do
+ so
+ some
+ both
+ yourself
+ am
+ their
+ having
+ she
+ should
+ them
+ in
+ during
+ will
+ shall
+ could
+ would
+ ai
+ ca
+ sha
+ wo
static/en_stopwords_ngram.txt ADDED
@@ -0,0 +1,134 @@
+ out
+ ll
+ during
+ had
+ but
+ own
+ re
+ there
+ your
+ ourselves
+ ours
+ whom
+ an
+ if
+ as
+ against
+ with
+ in
+ so
+ his
+ were
+ by
+ at
+ theirs
+ they
+ yourselves
+ yours
+ are
+ you
+ could
+ our
+ some
+ ai
+ myself
+ those
+ these
+ who
+ cannot
+ through
+ this
+ very
+ their
+ where
+ only
+ her
+ above
+ down
+ been
+ that
+ will
+ am
+ its
+ up
+ each
+ on
+ no
+ just
+ itself
+ once
+ be
+ from
+ sha
+ himself
+ what
+ for
+ yourself
+ me
+ while
+ being
+ is
+ more
+ here
+ over
+ my
+ would
+ why
+ she
+ he
+ ve
+ to
+ before
+ further
+ it
+ how
+ until
+ should
+ all
+ when
+ again
+ do
+ him
+ both
+ hers
+ too
+ most
+ about
+ same
+ between
+ such
+ shall
+ has
+ which
+ can
+ having
+ few
+ the
+ because
+ did
+ into
+ than
+ them
+ we
+ does
+ below
+ was
+ of
+ off
+ now
+ after
+ under
+ ca
+ any
+ nor
+ not
+ herself
+ ought
+ or
+ themselves
+ other
+ doing
+ then
+ have
+ and
+ wo
static/quartzo.ttf ADDED
Binary file (116 kB).
 
static/twitter_mask.png ADDED