darthPanda committed on
Commit
d09b322
•
1 Parent(s): f746f70
Files changed (4) hide show
  1. app.py +138 -25
  2. helper_functions.py +118 -65
  3. requirements.txt +1 -0
  4. static/yt_mask.png +0 -0
app.py CHANGED
@@ -10,36 +10,68 @@ import plotly.io as pio
10
  import plotly
11
 
12
  # Whenever the search button is clicked, the search_callback function is called
13
- def search_callback():
14
- if twitter_agree:
15
- if len(st.session_state.search_term) == 0:
16
  st.error("Please enter a search term")
17
  return
18
  try:
19
- st.session_state.df = hf.get_tweets(st.session_state.search_term, st.session_state.num_tweets)
20
  st.session_state.df = hf.get_sentiment(st.session_state.df)
21
  except:
22
  st.error("Please enter a valid search term")
23
  return
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def twitter_form():
26
  with st.form(key="search_form"):
27
  st.subheader("Search Parameters")
28
- st.text_input("Enter a User handle (like _@elonmusk_), Hashtag (like _#Bitcoin_) or Topic (like _climate change_)", key="search_term")
29
  st.slider("Number of tweets", min_value=100, max_value=500, key="num_tweets")
30
- st.form_submit_button(label="Search", on_click=search_callback)
31
  st.markdown(
32
  "Note: it may take a while to load the results, especially with large number of tweets"
33
  )
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  with st.sidebar:
37
  st.title("Social Media Sentiment Analyzer")
38
- st.subheader("Choose your platform")
39
- twitter_agree = st.checkbox('Twitter')
 
 
 
 
 
 
 
40
 
41
- if twitter_agree:
42
  twitter_form()
 
 
 
43
 
44
  st.markdown(
45
  "<div style='position: fixed; bottom: 0;'>Created by Taaha Bajwa</div>",
@@ -91,6 +123,56 @@ if "df" in st.session_state:
91
  wordcloud = hf.plot_wordcloud(tweet_df, colormap=wc_color)
92
  st.pyplot(wordcloud)
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  adjust_tab_font = """
95
  <style>
96
  button[data-baseweb="tab"] > div[data-testid="stMarkdownContainer"] > p {
@@ -101,19 +183,50 @@ if "df" in st.session_state:
101
 
102
  st.write(adjust_tab_font, unsafe_allow_html=True)
103
 
104
- try:
105
- tab1, tab2, tab3, tab4 = st.tabs(["All", "Positive 😊", "Negative ☹️", "Neutral 😐"])
106
- with tab1:
107
- tweet_df = st.session_state.df
108
- make_dashboard(tweet_df, bar_color="#1F77B4", wc_color="Blues")
109
- with tab2:
110
- tweet_df = st.session_state.df.query("Sentiment == 'Positive'")
111
- make_dashboard(tweet_df, bar_color="#54A24B", wc_color="Greens")
112
- with tab3:
113
- tweet_df = st.session_state.df.query("Sentiment == 'Negative'")
114
- make_dashboard(tweet_df, bar_color="#FF7F0E", wc_color="Oranges")
115
- with tab4:
116
- tweet_df = st.session_state.df.query("Sentiment == 'Neutral'")
117
- make_dashboard(tweet_df, bar_color="#1F77B4", wc_color="Blues")
118
- except:
119
- st.error("No plots to display.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import plotly
11
 
12
  # Whenever the search button is clicked, the search_callback function is called
13
def search_callback_twitter():
    """Fetch and score tweets for the term submitted in the Twitter form.

    Used as the ``on_click`` callback of the Twitter search form.  On success
    the sentiment-scored result is stored in ``st.session_state.df``.  On any
    failure an error is shown and the previously stored ``df`` (if any) is
    left untouched.
    """
    if platform != "Twitter":
        return

    search_term = st.session_state.search_term_twitter
    # A whitespace-only term is as unusable as an empty one.
    if not search_term.strip():
        st.error("Please enter a search term")
        return

    try:
        tweets = hf.get_tweets(search_term, st.session_state.num_tweets)
        scored = hf.get_sentiment(tweets)
    except Exception:
        # Deliberately best-effort: every fetch/scoring failure is reported to
        # the user the same way.  ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        st.error("Please enter a valid search term")
        return

    # Publish only after both steps succeeded, so the dashboard never sees a
    # frame of tweets that has not been sentiment-scored yet.
    st.session_state.df = scored
24
+
25
def search_callback_youtube():
    """Fetch and score comments for the video URL submitted in the YouTube form.

    Used as the ``on_click`` callback of the YouTube search form.  On success
    the sentiment-scored result is stored in ``st.session_state.df``.  On any
    failure an error is shown and the previously stored ``df`` (if any) is
    left untouched.
    """
    if platform != "Youtube":
        return

    video_url = st.session_state.search_term_youtube
    # A whitespace-only URL is as unusable as an empty one.
    if not video_url.strip():
        st.error("Please enter a valid url")
        return

    try:
        comments = hf.get_youtube_comments(video_url, st.session_state.num_comments)
        scored = hf.get_sentiment_youtube(comments)
    except Exception:
        # Deliberately best-effort: every download/scoring failure is reported
        # to the user the same way.  ``Exception`` (not a bare ``except``)
        # keeps KeyboardInterrupt/SystemExit propagating.
        st.error("Please enter a valid url")
        return

    # Publish only after both steps succeeded; otherwise ``df`` would be left
    # holding the raw comment list instead of a scored frame.
    st.session_state.df = scored
36
 
def twitter_form():
    """Render the form that collects the Twitter search term and tweet count.

    The widgets write to ``st.session_state.search_term_twitter`` and
    ``st.session_state.num_tweets``; submitting the form triggers
    ``search_callback_twitter``.
    """
    term_label = "Enter a User handle (like _@elonmusk_), Hashtag (like _#Bitcoin_) or Topic (like _climate change_)"
    loading_note = "Note: it may take a while to load the results, especially with large number of tweets"

    search_form = st.form(key="search_form")
    with search_form:
        st.subheader("Search Parameters")
        st.text_input(term_label, key="search_term_twitter")
        st.slider("Number of tweets", min_value=100, max_value=500, key="num_tweets")
        st.form_submit_button(label="Search", on_click=search_callback_twitter)
        st.markdown(loading_note)
46
+
47
def youtube_form():
    """Render the form that collects the YouTube video link and comment count.

    The widgets write to ``st.session_state.search_term_youtube`` and
    ``st.session_state.num_comments``; submitting the form triggers
    ``search_callback_youtube``.
    """
    link_label = "Enter a Video link to analyse comments"
    loading_note = "Note: it may take a while to load the results, especially with large number of comments"

    search_form = st.form(key="search_form")
    with search_form:
        st.subheader("Search Parameters")
        st.text_input(link_label, key="search_term_youtube")
        st.slider("Number of Comments", min_value=100, max_value=500, key="num_comments")
        st.form_submit_button(label="Search", on_click=search_callback_youtube)
        st.markdown(loading_note)
56
 
57
 
58
  with st.sidebar:
59
  st.title("Social Media Sentiment Analyzer")
60
+ #st.subheader("Choose your platform")
61
+ platform = st.radio(
62
+ "Choose your platform πŸ‘‡",
63
+ ["Twitter", "Youtube"],
64
+ # key="visibility",
65
+ # label_visibility=st.session_state.visibility,
66
+ # disabled=st.session_state.disabled,
67
+ horizontal=True,
68
+ )
69
 
70
+ if platform == "Twitter":
71
  twitter_form()
72
+
73
+ if platform == "Youtube":
74
+ youtube_form()
75
 
76
  st.markdown(
77
  "<div style='position: fixed; bottom: 0;'>Created by Taaha Bajwa</div>",
 
123
  wordcloud = hf.plot_wordcloud(tweet_df, colormap=wc_color)
124
  st.pyplot(wordcloud)
125
 
126
+
127
def make_dashboard_youtube(tweet_df, bar_color, wc_color):
    """Render the two-row sentiment dashboard for a frame of YouTube comments.

    Args:
        tweet_df: DataFrame with a ``Sentiment`` column and the comment text
            in a ``Comment`` column.
        bar_color: Bar colour passed to the n-gram charts.
        wc_color: Colormap name passed to the word cloud.
    """
    # The n-gram and word-cloud helpers read the text from a "Tweet" column,
    # so work on a renamed copy; the caller's frame is not modified.
    plot_df = tweet_df.rename(columns={"Comment": "Tweet"})

    # first row: sentiment distribution, top unigrams, top bigrams
    col1, col2, col3 = st.columns([28, 34, 38])
    with col1:
        sentiment_plot = hf.plot_sentiment(plot_df)
        sentiment_plot.update_layout(height=350, title_x=0.5)
        st.plotly_chart(sentiment_plot, theme=None, use_container_width=True)
    with col2:
        top_unigram = hf.get_top_n_gram(plot_df, ngram_range=(1, 1), n=10)
        unigram_plot = hf.plot_n_gram(
            top_unigram, title="Top 10 Occuring Words", color=bar_color
        )
        unigram_plot.update_layout(height=350)
        st.plotly_chart(unigram_plot, theme=None, use_container_width=True)
    with col3:
        top_bigram = hf.get_top_n_gram(plot_df, ngram_range=(2, 2), n=10)
        bigram_plot = hf.plot_n_gram(
            top_bigram, title="Top 10 Occuring Bigrams", color=bar_color
        )
        bigram_plot.update_layout(height=350)
        st.plotly_chart(bigram_plot, theme=None, use_container_width=True)

    # second row: colour-coded comment table and word cloud
    col1, col2 = st.columns([60, 40])
    with col1:
        cell_styles = {
            "Positive": "background-color: #54A24B; color: white",
            "Negative": "background-color: #FF7F0E",
        }

        def sentiment_color(sentiment):
            # Anything that is neither Positive nor Negative gets the neutral blue.
            return cell_styles.get(sentiment, "background-color: #1F77B4")

        # Show the text under its user-facing "Comment" heading again.
        table_df = plot_df[["Sentiment", "Tweet"]].rename(columns={"Tweet": "Comment"})
        st.dataframe(
            table_df.style.applymap(sentiment_color, subset=["Sentiment"]),
            height=350,
        )
    with col2:
        wordcloud = hf.plot_wordcloud(plot_df, colormap=wc_color, mask_url='static/yt_mask.png')
        try:
            st.pyplot(wordcloud)
        except Exception:
            # Rendering is best-effort; ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            st.write("Wordcloud not available for this search term")
175
+
176
  adjust_tab_font = """
177
  <style>
178
  button[data-baseweb="tab"] > div[data-testid="stMarkdownContainer"] > p {
 
183
 
184
  st.write(adjust_tab_font, unsafe_allow_html=True)
185
 
186
# One entry per tab: (tab label, Sentiment value to keep or None for all rows,
# bar colour, word-cloud colormap, message shown when the selection is empty).
SENTIMENT_TABS = (
    ("All", None, "#1F77B4", "Blues", "No comments found."),
    ("Positive 😊", "Positive", "#54A24B", "Greens", "No positive comments found."),
    ("Negative ☹️", "Negative", "#FF7F0E", "Oranges", "No negative comments found."),
    ("Neutral 😐", "Neutral", "#1F77B4", "Blues", "No neutral comments found."),
)


def render_sentiment_tabs(render_dashboard, report_empty):
    """Render the All/Positive/Negative/Neutral tabs for ``st.session_state.df``.

    Args:
        render_dashboard: Callable invoked as
            ``render_dashboard(df, bar_color=..., wc_color=...)`` inside each tab.
        report_empty: When true, a tab whose selection has no rows shows its
            "no ... found" message instead of calling ``render_dashboard``.
    """
    try:
        tabs = st.tabs([label for label, *_ in SENTIMENT_TABS])
        for tab, (_, sentiment, bar_color, wc_color, empty_message) in zip(tabs, SENTIMENT_TABS):
            with tab:
                tweet_df = st.session_state.df
                if sentiment is not None:
                    tweet_df = tweet_df.query(f"Sentiment == '{sentiment}'")
                if report_empty and tweet_df.shape[0] == 0:
                    st.write(empty_message)
                else:
                    render_dashboard(tweet_df, bar_color=bar_color, wc_color=wc_color)
    except Exception:
        # Best-effort rendering: any plotting failure collapses to one notice.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating.
        st.error("No plots to display.")


# Only the dashboard of the currently selected platform is drawn, and only
# once that platform's search box holds a term.
if platform == "Twitter" and st.session_state.search_term_twitter != "":
    render_sentiment_tabs(make_dashboard, report_empty=False)
elif platform == "Youtube" and st.session_state.search_term_youtube != "":
    render_sentiment_tabs(make_dashboard_youtube, report_empty=True)
helper_functions.py CHANGED
@@ -6,10 +6,15 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
  from transformers import pipeline
7
  import plotly.express as px
8
  import plotly.io as pio
 
9
  import matplotlib as mpl
10
  import matplotlib.pyplot as plt
11
  from wordcloud import WordCloud
12
  from PIL import Image
 
 
 
 
13
 
14
  @st.cache(allow_output_mutation=True)
15
  def get_nltk():
@@ -108,6 +113,45 @@ def get_tweets(query, max_tweets):
108
  tweets_df.drop('Datetime', axis=1, inplace=True)
109
  return tweets_df
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  def text_preprocessing(text):
112
  stopwords = set()
113
  with open("static/en_stopwords.txt", "r") as file:
@@ -127,7 +171,6 @@ def text_preprocessing(text):
127
  cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
128
  cleaned_text = re.sub(non_alpha, " ", cleaned_text)
129
  tokens = word_tokenize(cleaned_text)
130
- #print('tokens')
131
  # provide POS tag for lemmatization to yield better result
132
  word_tag_tuples = pos_tag(tokens, tagset="universal")
133
  tag_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
@@ -183,73 +226,83 @@ def plot_sentiment(tweet_df):
183
  fig.update_layout(showlegend=False)
184
  return fig
185
 
 
 
186
  def get_top_n_gram(tweet_df, ngram_range, n=10):
187
- stopwords = set()
188
- with open("static/en_stopwords_ngram.txt", "r") as file:
189
- for word in file:
190
- stopwords.add(word.rstrip("\n"))
191
- stopwords = list(stopwords)
192
- corpus = tweet_df["Tweet"]
193
- vectorizer = CountVectorizer(
194
- analyzer="word", ngram_range=ngram_range, stop_words=stopwords
195
- )
196
- X = vectorizer.fit_transform(corpus.astype(str).values)
197
- words = vectorizer.get_feature_names_out()
198
- words_count = np.ravel(X.sum(axis=0))
199
- df = pd.DataFrame(zip(words, words_count))
200
- df.columns = ["words", "counts"]
201
- df = df.sort_values(by="counts", ascending=False).head(n)
202
- df["words"] = df["words"].str.title()
203
- return df
 
 
 
204
 
205
  def plot_n_gram(n_gram_df, title, color="#54A24B"):
206
- fig = px.bar(
207
- # n_gram_df,
208
- # x="counts",
209
- # y="words",
210
- x=n_gram_df.counts,
211
- y=n_gram_df.words,
212
- title="<b>{}</b>".format(title),
213
- text_auto=True,
214
- )
215
- fig.update_layout(plot_bgcolor="white")
216
- fig.update_xaxes(title=None)
217
- fig.update_yaxes(autorange="reversed", title=None)
218
- fig.update_traces(hovertemplate="<b>%{y}</b><br>Count=%{x}", marker_color=color)
219
- return fig
 
 
 
 
220
 
221
- def plot_wordcloud(tweet_df, colormap="Greens"):
222
- stopwords = set()
223
- with open("static/en_stopwords_ngram.txt", "r") as file:
224
- for word in file:
225
- stopwords.add(word.rstrip("\n"))
226
- cmap = mpl.cm.get_cmap(colormap)(np.linspace(0, 1, 20))
227
- cmap = mpl.colors.ListedColormap(cmap[10:15])
228
- mask = np.array(Image.open("static/twitter_mask.png"))
229
- font = "static/quartzo.ttf"
230
- #tweet_df["Cleaned_Tweet"] = tweet_df["Tweet"].apply(lambda x: text_preprocessing(x))
231
- tweet_df["Cleaned_Tweet"] = tweet_df["Tweet"].apply(text_preprocessing)
232
- #print(tweet_df["Cleaned_Tweet"])
233
- text = " ".join(tweet_df["Cleaned_Tweet"])
234
- #print(text)
235
- wc = WordCloud(
236
- background_color="white",
237
- font_path=font,
238
- stopwords=stopwords,
239
- max_words=90,
240
- colormap=cmap,
241
- mask=mask,
242
- random_state=42,
243
- collocations=False,
244
- min_word_length=2,
245
- max_font_size=200,
246
- )
247
- wc.generate(text)
248
- fig = plt.figure(figsize=(8, 8))
249
- ax = fig.add_subplot(1, 1, 1)
250
- plt.imshow(wc, interpolation="bilinear")
251
- plt.axis("off")
252
- plt.title("Wordcloud", fontdict={"fontsize": 16}, fontweight="heavy", pad=20, y=1.0)
253
- return fig
 
254
 
255
 
 
6
  from transformers import pipeline
7
  import plotly.express as px
8
  import plotly.io as pio
9
+ import plotly.graph_objects as go
10
  import matplotlib as mpl
11
  import matplotlib.pyplot as plt
12
  from wordcloud import WordCloud
13
  from PIL import Image
14
+ import requests
15
+ from itertools import islice
16
+ from youtube_comment_downloader import *
17
+
18
 
19
  @st.cache(allow_output_mutation=True)
20
  def get_nltk():
 
113
  tweets_df.drop('Datetime', axis=1, inplace=True)
114
  return tweets_df
115
 
116
def get_youtube_comments(url, num_comments, timeout=10):
    """Download the text of up to ``num_comments`` popular comments of a video.

    Args:
        url: Address of the YouTube video page.
        num_comments: Maximum number of comments to return.
        timeout: Seconds to wait for the availability check before giving up
            (``requests`` waits indefinitely when no timeout is given).

    Returns:
        A list of comment texts, at most ``num_comments`` long.

    Raises:
        ValueError: If the page reports the video as unavailable.
        requests.RequestException: If the page cannot be fetched.
    """
    unavailable_marker = '"playabilityStatus":{"status":"ERROR","reason":"Video unavailable"'

    # Fetch the page once up front so a missing video is reported explicitly
    # instead of surfacing as an empty or failing comment download.
    response = requests.get(url, timeout=timeout)
    if unavailable_marker in response.text:
        raise ValueError('Video does not exist')

    downloader = YoutubeCommentDownloader()
    comments = downloader.get_comments_from_url(url, sort_by=SORT_BY_POPULAR)
    # ``islice`` stops pulling from the comment iterator after ``num_comments`` items.
    return [comment['text'] for comment in islice(comments, num_comments)]
132
+
133
# FinBERT pipeline, built on first use and then reused: constructing it loads
# the model, which is far too slow to repeat on every search.
_FINBERT_CLASSIFIER = None


def _get_finbert_classifier():
    """Return the shared ProsusAI/finbert pipeline, creating it on first use."""
    global _FINBERT_CLASSIFIER
    if _FINBERT_CLASSIFIER is None:
        _FINBERT_CLASSIFIER = pipeline(model="ProsusAI/finbert")
    return _FINBERT_CLASSIFIER


def get_sentiment_youtube(useful_sentence):
    """Classify each comment with FinBERT and return the results as a frame.

    Args:
        useful_sentence: Iterable of comment strings.

    Returns:
        DataFrame with one row per comment and the columns ``Comment``,
        ``Sentiment`` (``Positive``/``Negative``/``Neutral``) and ``score``.
        Empty input yields an empty frame with the same columns.
    """
    columns = ["Comment", "Sentiment", "score"]
    comments = list(useful_sentence)
    if not comments:
        # Nothing to classify: skip loading the model and return a frame the
        # dashboard can still filter on.
        return pd.DataFrame(columns=columns)

    classifier = _get_finbert_classifier()
    # truncation=True keeps a single over-long comment from aborting the run.
    predictions = classifier(comments, truncation=True)

    df = pd.DataFrame(
        {
            "Comment": comments,
            "Sentiment": [prediction["label"] for prediction in predictions],
            "score": [prediction["score"] for prediction in predictions],
        },
        columns=columns,
    )
    # The model emits lower-case labels; the dashboard filters on capitalised ones.
    df["Sentiment"] = df["Sentiment"].replace(
        {"positive": "Positive", "negative": "Negative", "neutral": "Neutral"}
    )
    return df
153
+
154
+
155
  def text_preprocessing(text):
156
  stopwords = set()
157
  with open("static/en_stopwords.txt", "r") as file:
 
171
  cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
172
  cleaned_text = re.sub(non_alpha, " ", cleaned_text)
173
  tokens = word_tokenize(cleaned_text)
 
174
  # provide POS tag for lemmatization to yield better result
175
  word_tag_tuples = pos_tag(tokens, tagset="universal")
176
  tag_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
 
226
  fig.update_layout(showlegend=False)
227
  return fig
228
 
229
+
230
+
def get_top_n_gram(tweet_df, ngram_range, n=10):
    """Count the most frequent n-grams in the ``Tweet`` column.

    Args:
        tweet_df: DataFrame whose ``Tweet`` column holds the texts.
        ngram_range: ``(min_n, max_n)`` tuple passed to ``CountVectorizer``.
        n: Number of top n-grams to keep.

    Returns:
        DataFrame with the columns ``words`` (title-cased n-gram) and
        ``counts``, sorted by descending count and limited to ``n`` rows.
        If counting fails for any reason (for example no usable words are
        left), an empty frame with the same columns is returned instead of
        ``None`` so callers always receive a DataFrame.
    """
    columns = ["words", "counts"]
    try:
        with open("static/en_stopwords_ngram.txt", "r") as file:
            stopwords = list({line.rstrip("\n") for line in file})
        corpus = tweet_df["Tweet"]
        vectorizer = CountVectorizer(
            analyzer="word", ngram_range=ngram_range, stop_words=stopwords
        )
        X = vectorizer.fit_transform(corpus.astype(str).values)
        words = vectorizer.get_feature_names_out()
        # Column sums of the document-term matrix give the total count per n-gram.
        words_count = np.ravel(X.sum(axis=0))
        df = pd.DataFrame(zip(words, words_count), columns=columns)
        df = df.sort_values(by="counts", ascending=False).head(n)
        df["words"] = df["words"].str.title()
        return df
    except Exception:
        # Best-effort by design: the dashboard should still render when the
        # n-grams cannot be computed.
        return pd.DataFrame(columns=columns)
252
 
def plot_n_gram(n_gram_df, title, color="#54A24B"):
    """Build a horizontal-count bar chart of n-gram frequencies.

    Args:
        n_gram_df: DataFrame with ``words`` and ``counts`` columns.
        title: Chart title; rendered in bold.
        color: Bar colour.

    Returns:
        A Plotly figure.  If the chart cannot be built (for example
        ``n_gram_df`` is ``None`` or lacks the expected columns) an empty
        Plotly figure is returned so the caller can still lay it out.
    """
    try:
        fig = px.bar(
            x=n_gram_df.counts,
            y=n_gram_df.words,
            title="<b>{}</b>".format(title),
            text_auto=True,
        )
        fig.update_layout(plot_bgcolor="white")
        fig.update_xaxes(title=None)
        fig.update_yaxes(autorange="reversed", title=None)
        fig.update_traces(hovertemplate="<b>%{y}</b><br>Count=%{x}", marker_color=color)
        return fig
    except Exception:
        # Best-effort by design; ``Exception`` rather than a bare ``except``
        # so KeyboardInterrupt/SystemExit still propagate.
        return go.Figure()
272
 
273
def plot_wordcloud(tweet_df, colormap="Greens", mask_url="static/twitter_mask.png"):
    """Draw a masked word cloud of the texts in the ``Tweet`` column.

    Args:
        tweet_df: DataFrame whose ``Tweet`` column holds the texts.  The frame
            is not modified.
        colormap: Name of the Matplotlib colormap the word colours are taken from.
        mask_url: Path of the image used as the word-cloud mask.

    Returns:
        A Matplotlib figure.  If the cloud cannot be generated, the figure
        carries a short notice instead, so it can always be passed to
        ``st.pyplot``.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.axis("off")

    try:
        with open("static/en_stopwords_ngram.txt", "r") as file:
            stopwords = {line.rstrip("\n") for line in file}

        # plt.get_cmap is available across Matplotlib releases, whereas
        # mpl.cm.get_cmap was removed in Matplotlib 3.9.
        shades = plt.get_cmap(colormap)(np.linspace(0, 1, 20))
        # Keep only five mid-to-dark shades out of the twenty samples.
        cmap = mpl.colors.ListedColormap(shades[10:15])
        mask = np.array(Image.open(mask_url))
        font = "static/quartzo.ttf"

        # Clean into a local Series rather than adding a column to the
        # caller's frame.
        cleaned_tweets = tweet_df["Tweet"].apply(text_preprocessing)
        text = " ".join(cleaned_tweets)

        wc = WordCloud(
            background_color="white",
            font_path=font,
            stopwords=stopwords,
            max_words=90,
            colormap=cmap,
            mask=mask,
            random_state=42,
            collocations=False,
            min_word_length=2,
            max_font_size=200,
        )
        wc.generate(text)
    except Exception:
        # Best-effort by design.  Return a Matplotlib placeholder: callers
        # hand the result to st.pyplot, which cannot render a Plotly figure.
        ax.text(
            0.5,
            0.5,
            "Wordcloud not available for this search term",
            ha="center",
            va="center",
            transform=ax.transAxes,
        )
        return fig

    ax.imshow(wc, interpolation="bilinear")
    ax.set_title("Wordcloud", fontdict={"fontsize": 16}, fontweight="heavy", pad=20, y=1.0)
    return fig
307
 
308
 
requirements.txt CHANGED
@@ -7,3 +7,4 @@ plotly==5.9.0
7
  nltk
8
  scikit-learn
9
  wordcloud
 
 
7
  nltk
8
  scikit-learn
9
  wordcloud
10
+ youtube-comment-downloader
static/yt_mask.png ADDED