Delete app.py
app.py
DELETED
@@ -1,518 +0,0 @@
# Importing necessary libraries.
import streamlit as st
st.set_page_config(page_title="Monkeypox misinformation detector",
                   page_icon=":lion:",
                   layout="wide",
                   initial_sidebar_state="auto",
                   menu_items=None)
import tweepy as tw
import textacy
from textacy import preprocessing
import emoji
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import datetime as dt
import time
import copy
import altair as alt


@st.experimental_singleton(show_spinner=False)
def load_model():
    """
    This function loads the fine-tuned HuggingFace model and caches
    it (using the experimental_singleton decorator) to improve
    computation times.

    Parameters: none.
    Returns: HuggingFace transformer model.
    """

    model = TFAutoModelForSequenceClassification.from_pretrained("smcrone/monkeypox-misinformation")
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-6),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=tf.keras.metrics.SparseCategoricalAccuracy())
    return model


@st.experimental_singleton(show_spinner=False)
def load_tokenizer():
    """
    This function loads a tokenizer for the transformer model and caches
    it (using the experimental_singleton decorator) to improve
    computation times.

    Parameters: none.
    Returns: tokenizer.
    """

    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased", use_fast=False)
    return tokenizer
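
# Note: when called as in classify_tweets below (padding="max_length",
# truncation=True), this tokenizer returns a dict with 'input_ids',
# 'token_type_ids' and 'attention_mask', each padded or truncated to the
# model's 512-token maximum.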


@st.experimental_singleton(show_spinner=False)
def load_client():
    """
    This function authenticates the Tweepy client and caches
    the object (using the experimental_singleton decorator) to
    improve computation times.

    Parameters: none.
    Returns: Tweepy client.
    """

    bearer_token = st.secrets["bearer_token"]
    client = tw.Client(bearer_token, wait_on_rate_limit=True)
    return client
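
# The bearer token is read from Streamlit's secrets store. A minimal
# .streamlit/secrets.toml entry would look like this (placeholder value):
#   bearer_token = "YOUR-TWITTER-API-BEARER-TOKEN"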


def dataframe_preprocessing(df_to_preprocess: pd.DataFrame):
    """
    The program collects tweet data at two junctures: first on provision
    of the initial tweet, and second if the classification of the initial
    tweet prompts a review of the user's other recent tweets. At both of
    these junctures certain preprocessing steps -- designed to increase
    the intelligibility of text inputs to the model -- are identical, so
    this function avoids unnecessary repetition of that code. It takes a
    Pandas DataFrame for preprocessing and returns the DataFrame, having
    executed certain preprocessing steps (e.g. removal of emojis,
    normalization of whitespace, removal of columns, etc.)

    Parameters: df_to_preprocess (DataFrame)
    Returns: df_to_preprocess (DataFrame)
    """

    # 'userlocation' will not be in the DataFrame if the user has not
    # supplied the field, so for the time being fill it with 'None'.
    # !!! note: we will likely NOT use userlocation, so this block can be
    # removed in later versions !!!
    if 'userlocation' not in df_to_preprocess.columns:
        df_to_preprocess['userlocation'] = 'None'
    # Dropping redundant columns.
    df_to_preprocess = df_to_preprocess.drop(labels=['public_metrics', 'userpublic_metrics'], axis=1)
    # Stripping timezone info for export to Excel.
    df_to_preprocess['created_at'] = df_to_preprocess['created_at'].dt.tz_localize(None)
    df_to_preprocess['usercreated_at'] = df_to_preprocess['usercreated_at'].dt.tz_localize(None)
    # Replacing URLs and emojis; normalizing bullet points, whitespace, etc.
    for feature in ['text', 'userdescription', 'userlocation', 'userurl', 'username']:
        df_to_preprocess[feature] = df_to_preprocess[feature].fillna('None').apply(str)
        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.replace.urls(text=x, repl='_URL_'))
        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: emoji.demojize(x))
        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.normalize.bullet_points(text=x))
        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.normalize.quotation_marks(text=x))
        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.normalize.whitespace(text=x))
        df_to_preprocess[feature] = df_to_preprocess[feature].replace('\n', ' ', regex=True).replace('\r', '', regex=True)
    # Renaming columns (for greater model intelligibility).
    df_to_preprocess.rename(columns={"userverified": "user is verified",
                                     "userurl": "user has url",
                                     "userdescription": "user description",
                                     "usercreated_at": "user created at",
                                     "followers_count": "followers count",
                                     "following_count": "following count",
                                     "tweet_count": "tweet count",
                                     "userlocation": "user location"},
                            inplace=True)
    # Making URL column binary.
    df_to_preprocess['user has url'].replace({'_URL_': 'True', "": 'False'}, inplace=True)
    # Adding some extra features.
    df_to_preprocess['years since account created'] = df_to_preprocess['created_at'].dt.year.astype('Int64') - df_to_preprocess['user created at'].dt.year.astype('Int64')
    df_to_preprocess['tweets per day'] = df_to_preprocess['tweet count'] / ((df_to_preprocess['created_at'] - df_to_preprocess['user created at']).dt.days)
    df_to_preprocess['follower to following ratio'] = df_to_preprocess['followers count'] / (df_to_preprocess['following count'] + 1)
    # Returning processed DataFrame.
    return df_to_preprocess
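
# Illustration of the derived features (hypothetical values): an account
# created in 2019 that has posted 8,000 tweets by the time of a tweet dated
# 2022 (roughly 1,095 days later) yields 'years since account created' = 3
# and 'tweets per day' of about 7.3.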


def feature_concatenation(dataframe_to_concatenate: pd.DataFrame, features: list):
    """
    Our transformer model was fine-tuned on text input that combines
    a number of fields in a single string. This function performs
    the concatenation of these features, which, in addition to dataframe
    preprocessing, is a necessary preprocessing step. The final dataframe
    consists of just two columns: one containing the concatenated text and
    the other containing the number of retweets that the tweet received
    (for use later on).

    Parameters:

    1. dataframe_to_concatenate (DataFrame): the df from which to take the features.
    2. features (list of str): the features to concatenate.

    Returns:

    1. final_concatenated_dataframe (DataFrame): the dataframe to be passed to the model.
    """

    # Make a copy of the dataframe consisting only of the specified features.
    concatenated_dataframe = dataframe_to_concatenate[features].copy()
    # Prefix each feature's value with its column name, then join with [SEP].
    for i in features:
        concatenated_dataframe[i] = concatenated_dataframe[i].name + ": " + concatenated_dataframe[i].astype(str)
    concatenated_dataframe['combined'] = concatenated_dataframe[features].apply(lambda row: ' [SEP] '.join(row.values.astype(str)), axis=1)
    final_concatenated_dataframe = pd.DataFrame({"combined": concatenated_dataframe['combined'], "retweets": dataframe_to_concatenate['retweet_count']})
    # Return the final DataFrame.
    return final_concatenated_dataframe
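
# Illustration of the concatenated format (hypothetical values): with
# features=['text', 'user description'], a row's 'combined' string would be
#   "text: Monkeypox update ... [SEP] user description: Health reporter ..."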


def classify_tweets(dataframe_to_classify: pd.DataFrame):
    """
    This function takes a DataFrame of tweets which, having gone through
    the necessary preprocessing steps, is ready to classify. The function
    is called both for the initial classification of a single tweet and,
    where necessary, the superspreader analysis of the user's recent tweets.
    The function iterates through the DataFrame provided, tokenizing and
    classifying each tweet, and assigning it to one of two lists within a
    dictionary: 'goodPosts' (i.e. non-misleading posts) and 'badPosts' (i.e.
    misleading posts). The function then returns the dictionary, which for
    each post includes the tweet itself, the predicted class, the confidence
    of the prediction, and the number of retweets received by the post.

    Parameters: dataframe_to_classify (DataFrame) -- the preprocessed
    DataFrame of tweet(s).

    Returns: tweet_dict (dict): a dictionary of classification results.
    """

    # Storing classification results in a dictionary with two keys.
    tweet_dict = {}
    tweet_dict['goodPosts'] = []
    tweet_dict['badPosts'] = []
    # Iterate through each tweet string in the DataFrame provided.
    for i in range(len(dataframe_to_classify['combined'])):
        # First, tokenize the tweet.
        tokenized_tweet = tokenizer(dataframe_to_classify['combined'].iloc[i], padding="max_length", truncation=True)
        # Next, convert the tweet to a format that TensorFlow will accept:
        # each field becomes a tensor of shape [1, 512] (a batch of one).
        predict_dict = {}
        for x, y in tokenized_tweet.items():
            a = tf.convert_to_tensor(y, dtype=None, dtype_hint=None, name=None)
            b = tf.reshape(a, [1, 512])
            predict_dict[x] = b
        # Call the model to predict the tweet.
        prediction = model(predict_dict, training=False)
        # Take the predicted class and the confidence in that class.
        pred_class = np.argmax(np.array(tf.nn.softmax(prediction.logits)))
        pred_conf = np.max(np.array(tf.nn.softmax(prediction.logits)))
        # Construct a list of variables that we wish to store.
        seq_to_append = [dataframe_to_classify['combined'].iloc[i], pred_class, pred_conf, dataframe_to_classify['retweets'].iloc[i]]
        # Add the list under the appropriate dictionary key.
        if pred_class == 1:
            tweet_dict['badPosts'].append(seq_to_append)
        elif pred_class == 0:
            tweet_dict['goodPosts'].append(seq_to_append)
        else:
            print("Something went wrong.")
            return
    # Return the dictionary of results.
    return tweet_dict
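
# Illustration of the returned structure (hypothetical values):
#   {'goodPosts': [['text: Monkeypox update ...', 0, 0.97, 12]],
#    'badPosts':  [['text: Monkeypox hoax ...', 1, 0.88, 340]]}
# i.e. [combined text, predicted class, confidence, retweets] per post.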


def get_user_tweets(user_id: str, days_to_go_back: int, client: tw.Client):
    """
    If the initial tweet provided to the web app is classified as
    misleading, then relevant tweets from the user must be gathered
    in order to perform the superspreader calculation. This function
    supports this process by collecting relevant user tweets, undertaking
    the necessary preprocessing steps (with support from other functions),
    and classifying the tweets using the classify_tweets function. It
    then returns the dictionary of results produced by classify_tweets.

    Parameters:

    1. user_id (int|str): the user_id to be fed to Tweepy.
    2. days_to_go_back (int): how many days' tweets to investigate.
    3. client: the Tweepy client instantiated by load_client.

    Returns:

    1. user_tweets_classified (dict): model outputs for user tweets.
    """

    # STAGE 1. FETCH USER TWEETS

    # Converting days_to_go_back into variables that can be fed to Tweepy.
    d = dt.datetime.today() - dt.timedelta(days=days_to_go_back)
    year = str(d.year)
    month = str(d.month)
    if len(month) == 1:
        month = '0' + month
    day = str(d.day)
    if len(day) == 1:
        day = '0' + day
    hour = str(d.hour)
    if len(hour) == 1:
        hour = '0' + hour
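    # Equivalent sketch: the same zero-padded timestamp could be produced in
    # one call with strftime, e.g. d.strftime('%Y-%m-%dT%H:00:00Z').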
    # Gathering tweets from the user.
    try:
        tweets_we_want_to_check = tw.Paginator(client.get_users_tweets,
                                               id=user_id,
                                               end_time=None,
                                               exclude=None,
                                               expansions=['author_id'],
                                               max_results=100,
                                               media_fields=None,
                                               pagination_token=None,
                                               place_fields=None,
                                               poll_fields=None,
                                               since_id=None,
                                               start_time='{}-{}-{}T{}:00:00Z'.format(year, month, day, hour),
                                               tweet_fields=['author_id', 'created_at', 'public_metrics', 'source'],
                                               until_id=None,
                                               user_fields=['created_at', 'description', 'location', 'public_metrics', 'url', 'verified'],
                                               user_auth=False,
                                               limit=500)
    except:
        return "Something went wrong whilst performing superspreader analysis."

    # STAGE 2. PREPROCESSING TWEET DATA

    # Parsing response data into an intermediate form.
    tweet_data_for_user = []
    user_data_for_user = []
    for page in tweets_we_want_to_check:
        # Converting each set of tweet fields into a dict and appending to list.
        for tweet in page.data:
            result = dict(tweet)
            tweet_data_for_user.append(result)
        # Converting each set of user fields into a dict and appending to list.
        for user in page.includes['users']:
            result = dict(user)
            user_data_for_user.append(result)
    # Adding user fields to tweet fields.
    for tweet in tweet_data_for_user:
        for user in user_data_for_user:
            for key, val in user.items():
                newKey = "user" + key
                tweet[newKey] = val
            break
    # Unpack and append any values that are dictionaries.
    for tweet in tweet_data_for_user:
        additional_values = {}
        for key, val in tweet.items():
            if type(val) == dict:
                for subkey, subval in val.items():
                    additional_values[subkey] = subval
        tweet.update(additional_values)
    # Create a Pandas DataFrame to store the data.
    user_df = pd.DataFrame(tweet_data_for_user)
    # Perform additional preprocessing using the dedicated function.
    user_df = dataframe_preprocessing(user_df)
    # Drop non-monkeypox related rows.
    user_df['monkeypox'] = user_df['text'].str.contains('monkeypox|monkey pox|money pox', case=False, regex=True)
    user_df.drop(user_df[user_df.monkeypox == False].index, inplace=True)
    # Concatenating chosen features.
    concatenated_df = feature_concatenation(user_df, ['text'])

    # STAGE 3. CALLING CLASSIFIER AND RETURNING RESULTS

    # Calling the classifier.
    classified_tweets = classify_tweets(concatenated_df)
    # Returning the dictionary of classified tweets.
    return classified_tweets


def on_receipt_of_tweet_query(request: str, client: tw.Client):
    """
    This function defines what the app should do on receipt of a tweet
    URL / ID from the end-user. It performs the following steps:
    (i) formats the string submitted by the user into a parsable form;
    (ii) fetches data for the tweet using Tweepy; (iii) performs some
    basic preprocessing on the data; (iv) calls dedicated preprocessing
    functions to finish preprocessing the data; (v) calls the classifier
    on the tweet; (vi) determines whether superspreader analysis is
    needed (i.e. if the tweet is classed as misleading); (vii) if so,
    calls the get_user_tweets function and calculates a superspreader
    score; (viii) returns a tuple of data for the application to display.

    Parameters:

    1. request (str): the URL or ID provided by the end-user.
    2. client: the Tweepy client instantiated by load_client.

    Returns:

    1. classified_tweet (dict): the metrics returned for the tweet by classify_tweets.
    2. spreader_score (float): where applicable, a metric representing the
       extent to which the user can be regarded as a superspreader of misinformation.
    3. tweet_text (str): the text of the tweet queried by the end-user.
    4. followers_count (int): the number of followers that the user has.
    5. classified_user_tweets (dict): where applicable, the metrics returned by
       get_user_tweets.
    """

    # STAGE 1. FETCH DATA FOR REQUESTED TWEET

    # If a URL is provided by the end-user, strip out the tweet ID.
    if '/' in request:
        request = request.split('/')[-1]
    # Collect tweet data -- interrupt if invalid input is provided.
    tweet = client.get_tweets(ids=request,
                              expansions=['author_id'],
                              media_fields=None,
                              place_fields=None,
                              poll_fields=None,
                              tweet_fields=['author_id', 'created_at', 'public_metrics', 'source'],
                              user_fields=['created_at', 'description', 'location', 'public_metrics', 'url', 'verified'],
                              user_auth=False)

    # STAGE 2. PREPROCESSING OF TWEET DATA

    # Create dictionaries out of the tweet and user data.
    for i in tweet.data:
        tweet_fields = dict(i)
    for i in tweet.includes['users']:
        user_fields = dict(i)
    # Add the data from the user dict to the tweet dict.
    for key, val in user_fields.items():
        newKey = "user" + key
        tweet_fields[newKey] = val
    # Unpack any values which are themselves dictionaries.
    additional_values = {}
    for key, val in tweet_fields.items():
        if type(val) == dict:
            for subkey, subval in val.items():
                additional_values[subkey] = subval
    tweet_fields.update(additional_values)
    # Convert everything to a DataFrame.
    tweet_df = pd.DataFrame(tweet_fields, index=[0])
    # Store the raw tweet text itself for later use.
    tweet_text = tweet_df['text'][0]
    # Store the followers count for later use.
    followers_count = tweet_df['followers_count'][0]
    # Preprocess the data using the dedicated functions.
    tweet_df = dataframe_preprocessing(tweet_df)
    concatenated_tweet_df = feature_concatenation(tweet_df, ['text'])

    # STAGE 3. CALLING CLASSIFIER AND DETERMINING NEXT STEPS

    # Call the classifier on the tweet.
    classified_tweet = classify_tweets(concatenated_tweet_df)
    # If the tweet is misleading, call get_user_tweets and calculate
    # the user's superspreader score.
    if len(classified_tweet['badPosts']) == 1:
        # Fetch a dictionary of classified user tweets.
        classified_user_tweets = get_user_tweets(tweet_df['userid'][0], 14, client=client)
        # Calculate the total number of retweets for all misleading posts.
        retweets_total = 0
        for tweet in classified_user_tweets['badPosts']:
            retweets_total += tweet[-1]
        # Assign the p (post) value.
        p = (0.21 * len(classified_user_tweets['badPosts'])) ** 1.13
        # Assign the f (follower) value.
        f = (0.25 * (np.log10(followers_count + 1))) ** 4.73
        # Assign the r (retweet) value.
        r = (1.04 * (np.log10(retweets_total + 1))) ** 0.96
        # Calculate spreader_score and return a tuple of info.
        spreader_score = max(((1 - (1 / (max(1, p + f + r)))) * 100), 1)
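        # Hypothetical worked example: 3 misleading posts, 10,000 followers
        # and 100 retweets give p = 0.63**1.13 ~ 0.59, f = 1.00**4.73 = 1.00
        # and r = 2.08**0.96 ~ 2.02, so spreader_score = (1 - 1/3.61) * 100 ~ 72.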
        return classified_tweet, tweet_text, followers_count, classified_user_tweets, retweets_total, spreader_score
    # Otherwise, if the tweet is not misleading, return the same info
    # (excluding any superspreader-related variables).
    elif len(classified_tweet['goodPosts']) == 1:
        return classified_tweet, tweet_text, followers_count, 0, 0, 0
    # Contingency in case an error should unexpectedly occur.
    else:
        raise Exception("Something went wrong whilst processing tweet data.")
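
# Hypothetical usage (assumes the model, tokenizer and client globals are
# already loaded; the tweet ID below is a placeholder):
#   results = on_receipt_of_tweet_query("https://twitter.com/user/status/1234567890", client)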


def webpage():
    """
    This function structures the main page of the web app using the
    conventions of Streamlit. It begins by loading the model, the tokenizer
    and the Tweepy client using the functions dedicated to those tasks.
    Each of these elements is then cached. The remaining content that the
    function generates then depends mostly on the inputs provided by the
    end-user.

    Parameters: none.
    Returns: nothing.
    """

    # Create a container for displaying loading messages, which will clear
    # once the tokenizer, Tweepy client and transformer model have loaded.
    loading_container = st.empty()
    with loading_container.container():
        global model
        model = load_model()
        global client
        client = load_client()
        global tokenizer
        tokenizer = load_tokenizer()
    loading_container.empty()

    # Write header content (e.g. banner image, title, description).
    st.image("monkeypox-small.jpg")
    st.title("Monkeypox misinformation detector")
    st.write("Use this tool to detect whether a tweet contains\
              monkeypox misinformation and assess the extent to which its\
              poster can be considered a misinformation superspreader.")

    st.sidebar.subheader("About")
    st.sidebar.write("This app has been developed using a\
                      [COVID-Twitter-BERT](https://huggingface.co/digitalepidemiologylab/covid-twitter-bert-v2)\
                      model fine-tuned on a monkeypox misinformation\
                      dataset. Users can learn more about the\
                      [model](https://huggingface.co/smcrone/monkeypox-misinformation) on the\
                      HuggingFace model repository and can explore on\
                      Kaggle the [dataset](https://www.kaggle.com/datasets/stephencrone/monkeypox)\
                      on which the model was trained. Further\
                      [documentation](https://www.kaggle.com/datasets/stephencrone/monkeypox),\
                      as well as the source code for the app, can be\
                      found in the project's GitHub repository.")

    st.sidebar.subheader("Contact")
    st.sidebar.write("If you have any questions, comments or feedback\
                      regarding this app that are not answered by the\
                      supporting documentation for the underpinning\
                      dataset or transformer model, please feel free\
                      to contact the author at sgscrone@liverpool.ac.uk.")

    # Provide a text box for the user to enter a tweet ID / URL.
    tweet_to_check = st.text_input("Please provide a tweet URL or ID", key="name")
    # If the string provided by the user is empty, do nothing.
    if tweet_to_check != "":
        # Otherwise, if the string is not empty, try fetching the tweet.
        try:
            classified_tweet, tweet_text, followers_count, classified_user_tweets, retweets_total, spreader_score = on_receipt_of_tweet_query(tweet_to_check, client)
            st.markdown("""<hr style="height:1px;border:none;background-color:#a6a6a6; margin-top:16px; margin-bottom:20px;" /> """, unsafe_allow_html=True)
            col1, col2 = st.columns(2)
            # In the left column, present the tweet text.
            col1.subheader("Tweet")
            tweet_text = textacy.preprocessing.normalize.whitespace(tweet_text)
            col1.markdown('<p style="background-color: #F0F2F6; padding: 8px 8px 8px 8px;">{}</p>'.format(tweet_text), unsafe_allow_html=True)
            # In the right column, present the tweet classification.
            col2.subheader("Rating for this tweet")
            if len(classified_tweet['goodPosts']) != 0:
                # Format blue for not misinformation.
                col2.markdown('<p style="color:White; background-color: #1661AD; text-align: center; font-size: 20px;">Not misinformation</p>', unsafe_allow_html=True)
                col2.markdown('<p style="font-size: 40px; text-align: center;">{}</p>'.format(format(classified_tweet['goodPosts'][0][2], '.0%')), unsafe_allow_html=True)
                col2.markdown('<p style="text-align: center;">confidence level</p>', unsafe_allow_html=True)
            else:
                # Format red for misinformation.
                col2.markdown('<p style="color:White; background-color: #701B20; text-align: center; font-size: 20px;">Misinformation</p>', unsafe_allow_html=True)
                col2.markdown('<p style="font-size: 40px; text-align: center;">{}</p>'.format(format(classified_tweet['badPosts'][0][2], '.0%')), unsafe_allow_html=True)
                col2.markdown('<p style="text-align: center;">confidence level</p>', unsafe_allow_html=True)
                # Add an additional container to display the superspreader analysis.
                superspreader_container = st.container()
                superspreader_container.subheader("Superspreader rating for this user")
                # Plot the superspreader score as a bar chart.
                score_to_plot = pd.DataFrame({"classified_tweet": ["score"], "spreader_score": [spreader_score]})
                bar = alt.Chart(score_to_plot).mark_bar().encode(alt.X('spreader_score:Q', scale=alt.Scale(domain=(0, 100)), axis=None), alt.Y('classified_tweet', axis=None)).properties(height=60)
                if spreader_score > 10:
                    label = bar.mark_text(align='right', baseline='middle', dx=-10, color='white', fontSize=20).encode(text=alt.Text("spreader_score:Q", format=",.0f"))
                else:
                    label = bar.mark_text(align='right', baseline='middle', dx=25, color='black', fontSize=20).encode(text=alt.Text("spreader_score:Q", format=",.0f"))
                x = bar + label
                x = x.configure_mark(color='#701B20')
                superspreader_container.altair_chart(x, use_container_width=True)
                # Display the stats on which the calculation was based.
                superspreader_container.write("Based on the user's **{:,} followers** and the following **{} tweet(s)** published over the last two weeks, which together received **{:,} retweet(s)**.".format(followers_count, len(classified_user_tweets['badPosts']), retweets_total))
                # And print the offending tweets from the user's recent history.
                for i in range(len(classified_user_tweets['badPosts'])):
                    recent_tweet = classified_user_tweets['badPosts'][i][0]
                    recent_tweet = recent_tweet.split('text:')[-1]
                    superspreader_container.markdown('<p style="background-color: #F0F2F6; padding: 8px 8px 8px 8px;">{}</p>'.format(recent_tweet), unsafe_allow_html=True)
        except:
            st.error("Could not retrieve information for tweet. Please ensure you are supplying a valid tweet ID or URL.")
        st.markdown("""<hr style="height:1px;border:none;background-color:#a6a6a6; margin-top:16px; margin-bottom:20px;" /> """, unsafe_allow_html=True)


if __name__ == "__main__":
    webpage()