Spaces:

smcrone
/

monkeypox

Runtime error

App Files Files Community

smcrone commited on Aug 30, 2022

Commit

665b455

•

1 Parent(s): 2460a9e

Upload model.py

Browse files

Files changed (1) hide show

model.py +518 -0

model.py ADDED Viewed

	@@ -0,0 +1,518 @@

+# Importing necessary libraries.
+import streamlit as st
+st.set_page_config(page_title="Monkeypox misinformation detector",
+                   page_icon=":lion:",
+                   layout="wide",
+                   initial_sidebar_state="auto",
+                   menu_items=None)
+import tweepy as tw
+import textacy
+from textacy import preprocessing
+import emoji
+import pandas as pd
+import numpy as np
+from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+import tensorflow as tf
+import datetime as dt
+import time
+import copy
+import altair as alt
+@st.experimental_singleton(show_spinner=False)
+def load_model():
+    """
+    This function loads the fine-tuned HuggingFace model and caches
+    it (using the experimental_singleton decorator) to improve
+    computation times.
+    Parameters: none.
+    Returns: HuggingFace transformer model.
+    """
+    model = TFAutoModelForSequenceClassification.from_pretrained("smcrone/monkeypox-misinformation")
+    model.compile(
+        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-6),
+        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+        metrics=tf.keras.metrics.SparseCategoricalAccuracy())
+    return model
+@st.experimental_singleton(show_spinner=False)
+def load_tokenizer():
+    """
+    This function loads a tokenizer for the transformer model and caches
+    it (using the experimental_singleton decorator) to improve
+    computation times.
+    Parameters: none.
+    Returns: tokenizer.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased",use_fast=False)
+    return tokenizer
+@st.experimental_singleton(show_spinner=False)
+def load_client():
+    """
+    This function authenticates the Tweepy client and caches
+    the object (using the experimental_singleton decorator) to
+    improve computation times.
+    Parameters: none.
+    Returns: Tweepy client.
+    """
+    bearer_token = st.secrets["bearer_token"]
+    client = tw.Client(bearer_token,wait_on_rate_limit=True)
+    return client
+def dataframe_preprocessing(df_to_preprocess:pd.DataFrame):
+    """
+    The program overall collects tweet data at two junctures: firstly
+    on provision of the initial tweet, and secondly if the classification
+    of the initial tweet prompts a review of the user's other recent tweets.
+    At both of these junctures certain preprocessing steps -- designed to
+    increase the intelligibility of text inputs to the model -- are identical,
+    so this function is designed to avoid the unnecessary repetition of this
+    code. The function takes a Pandas DataFrame for preprocessing and returns
+    the DataFrame, having executed certain preprocessing steps (e.g. removal
+    of emojis, normalization of whitespace, removal of columns, etc.)
+    Parameters: df_to_preprocess (DataFrame)
+    Returns: df_to_preprocess (DataFrame)
+    """
+    # userlocation will not be in dataframe is user not supplied field. So, for time being, fill with none if it does not exist.
+    # !!! note: we will likely NOT use userlocation, so can remove this bit of code in later versions!!!
+    if 'userlocation' not in df_to_preprocess.columns:
+        df_to_preprocess['userlocation'] = 'None'
+    # Dropping redundant columns.
+    df_to_preprocess = df_to_preprocess.drop(labels=['public_metrics', 'userpublic_metrics'], axis=1)
+    # Stripping timezone info for export to Excel.
+    df_to_preprocess['created_at'] = df_to_preprocess['created_at'].dt.tz_localize(None)
+    df_to_preprocess['usercreated_at'] = df_to_preprocess['usercreated_at'].dt.tz_localize(None)
+    # Replacing URLs and emojis; normalizing bullet points, whitespace, etc.
+    for feature in ['text','userdescription','userlocation','userurl','username']:
+        df_to_preprocess[feature] = df_to_preprocess[feature].fillna('None').apply(str)
+        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.replace.urls(text= x, repl= '_URL_'))
+        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: emoji.demojize(x))
+        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.normalize.bullet_points(text=x))
+        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.normalize.quotation_marks(text=x))
+        df_to_preprocess[feature] = df_to_preprocess[feature].apply(lambda x: textacy.preprocessing.normalize.whitespace(text=x))
+        df_to_preprocess[feature] = df_to_preprocess[feature].replace('\n', ' ', regex=True).replace('\r', '', regex=True)
+    # Renaming columns (for greater model intelligibility).
+    df_to_preprocess.rename(columns={"userverified": "user is verified",
+                        "userurl": "user has url",
+                        "userdescription": "user description",
+                        "usercreated_at": "user created at",
+                        "followers_count": "followers count",
+                        "following_count": "following count",
+                        "tweet_count": "tweet count",
+                        "userlocation": "user location"},
+                        inplace=True)
+    # Making URL column binary.
+    df_to_preprocess['user has url'].replace({'_URL_': 'True', "": 'False'}, inplace=True)
+    # Adding some extra features.
+    df_to_preprocess['years since account created'] = df_to_preprocess['created_at'].dt.year.astype('Int64') - df_to_preprocess['user created at'].dt.year.astype('Int64')
+    df_to_preprocess['tweets per day'] = df_to_preprocess['tweet count']/((df_to_preprocess['created_at'] - df_to_preprocess['user created at']).dt.days)
+    df_to_preprocess['follower to following ratio'] = df_to_preprocess['followers count']/(df_to_preprocess['following count']+1)
+    # Returning processed DataFrame.
+    return df_to_preprocess
+def feature_concatenation(dataframe_to_concatenate:pd.DataFrame,features:list):
+    """
+    Our transformer model was fine-tuned on text input that combines
+    a number of fields in a single string. This function performs
+    the concatenation of these features, which in addition to dataframe
+    preprocessing, is a necessary preprocessing step. The final dataframe
+    consists of just two columns: one containing the concatenated text and
+    the other containing the number of retweets that the tweet received
+    (for use later on).
+    Parameters:
+    1. dataframe_to_concatenate (DataFrame): the df from which to take the features.
+    2. features (list of str): the features to concatenate.
+    Returns:
+    1. finalDataFrame (DataFrame): the dataframe to be passed to the model.
+    """
+    # Make copy of dataframe consisting only of specified features.
+    concatenated_dataframe = dataframe_to_concatenate[features].copy()
+    # Concatenate chosen features.
+    for i in features:
+        concatenated_dataframe[i] = concatenated_dataframe[i].name + ": " + concatenated_dataframe[i].astype(str)
+        concatenated_dataframe['combined'] = concatenated_dataframe[features].apply(lambda row: ' [SEP] '.join(row.values.astype(str)), axis=1)
+    final_concatenated_dataframe = pd.DataFrame({"combined":concatenated_dataframe['combined'],"retweets":dataframe_to_concatenate['retweet_count']})
+    # Return the final DataFrame.
+    return final_concatenated_dataframe
+def classify_tweets(dataframe_to_classify:pd.DataFrame):
+    """
+    This function takes a DataFrame of tweets which, having gone through
+    the necessary preprocessing steps, is ready to classify. The function
+    is called both for the initial classification of a single tweet and,
+    where necessary, the superspreader analysis of the user's recent tweets.
+    The function iterates through the DataFrame provided, tokenizing and
+    classifying each tweet, and assigning it to one of two lists within a
+    dictionary: 'goodPosts' (i.e. non-misleading posts) and 'badPosts (i.e.
+    misleading posts). The function then returns the dictionary, which for
+    each post includes the tweet itself, the predicted class, the confidence
+    of the prediction, and the number of retweets received by the post.
+    Parameters: dataframe_to_classify (DataFrame) -- the preprocessed
+    DataFrame of tweet(s).
+    Returns: tweet_dict (dict): a dictionary of classification results.
+    """
+    # Storing classification results in a dictionary with two keys.
+    tweet_dict ={}
+    tweet_dict['goodPosts'] = []
+    tweet_dict['badPosts'] = []
+    # Iterate through each tweet string in the DataFrame provided.
+    for i in range(len(dataframe_to_classify['combined'])):
+        # First, tokenize the tweet.
+        tokenized_tweet = tokenizer(dataframe_to_classify['combined'].iloc[i],padding="max_length",truncation=True)
+        # Next, convert tweet to a format that TensorFlow will accept.
+        predict_dict = {}
+        for x,y in tokenized_tweet.items():
+            a = tf.convert_to_tensor(y, dtype=None, dtype_hint=None, name=None)
+            b = tf.reshape(a,[1,512])
+            predict_dict[x] = b
+        # Call model to predict tweet.
+        prediction = model(predict_dict,training=False)
+        # Take pred. class and confidence in pred. class
+        pred_class = np.argmax(np.array(tf.nn.softmax(prediction.logits)))
+        pred_conf = np.max(np.array(tf.nn.softmax(prediction.logits)))
+        # Construct a list of variables that we wish to store.
+        seq_to_append = [dataframe_to_classify['combined'].iloc[i],pred_class,pred_conf,dataframe_to_classify['retweets'].iloc[i]]
+        # Add list under appropriate dictionary key.
+        if pred_class == 1:
+            tweet_dict['badPosts'].append(seq_to_append)
+        elif pred_class == 0:
+            tweet_dict['goodPosts'].append(seq_to_append)
+        else:
+            print("Something went wrong.")
+            return
+    # Return the dictionary of results.
+    return tweet_dict
+def get_user_tweets(user_id:str, days_to_go_back:int, client:tw.Client):
+    """
+    If the initial tweet provided to the web app is classified as
+    misleading, then relevant tweets from the user must be gathered
+    in order to perform the superspreader calculation. This function
+    supports this process by collecting relevant user tweets, undertaking
+    the necessary preprocessing steps (with support from other functions),
+    and classifying the tweets using the classify_tweets function. It
+    then returns the dictionary of results produced by classify_tweets.
+    Parameters:
+    1. user_id (int|str): the user_id to be fed to Tweepy.
+    2. days_to_go_back (int): how many days' tweets to investigate.
+    3. client: the Tweepy client instantiated by load_client.
+    Returns:
+    1. user_tweets_classified (dict): model outputs for user tweets.
+    """
+    # STAGE 1. FETCH USER TWEETS
+    # Converting days_to_go_back into variables that can be fed to Tweepy.
+    d = dt.datetime.today() - dt.timedelta(days=days_to_go_back)
+    year = str(d.year)
+    month = str(d.month)
+    if len(month) == 1:
+        month = '0'+month
+    day = str(d.day)
+    if len(day) == 1:
+        day = '0'+day
+    hour = str(d.hour)
+    if len(hour) == 1:
+        hour = '0'+hour
+    # Gathering tweets from user.
+    try:
+        tweets_we_want_to_check = tw.Paginator(client.get_users_tweets,
+                                               id = user_id,
+                                               end_time=None,
+                                               exclude=None,
+                                               expansions=['author_id'],
+                                               max_results=100,
+                                               media_fields=None,
+                                               pagination_token=None,
+                                               place_fields=None,
+                                               poll_fields=None,
+                                               since_id=None,
+                                               start_time='{}-{}-{}T{}:00:00Z'.format(year,month,day,hour),
+                                               tweet_fields=['author_id','created_at','public_metrics','source'],
+                                               until_id=None,
+                                               user_fields=['created_at','description','location','public_metrics','url','verified'],
+                                               user_auth=False,
+                                               limit=500)
+    except:
+        return "Something went wrong whilst performing superspreader analysis."
+    # STAGE 2. PREPROCESSING TWEET DATA
+    # Parsing response data into an intermediate form.
+    tweet_data_for_user = []
+    user_data_for_user = []
+    for page in tweets_we_want_to_check:
+        # Converting each set of tweet fields into a dict and appending to list.
+        for tweet in page.data:
+            result = dict(tweet)
+            tweet_data_for_user.append(result)
+        # Converting each set of user fields into a dict and appending to list.
+        for user in page.includes['users']:
+            result = dict(user)
+            user_data_for_user.append(result)
+    # Adding user fields to tweet fields.
+    for tweet in tweet_data_for_user:
+        for user in user_data_for_user:
+            for key, val in user.items():
+                newKey = "user"+key
+                tweet[newKey] = val
+            break
+    # Unpack and append any values that are dictionaries.
+    for tweet in tweet_data_for_user:
+        additional_values = {}
+        for key, val in tweet.items():
+            if type(val) == dict:
+                for subkey, subval in val.items():
+                    additional_values[subkey] = subval
+        tweet.update(additional_values)
+    # Create a Pandas DataFrame to store the data.
+    user_df = pd.DataFrame(tweet_data_for_user)
+    # Perform additional preprocessing using dedicated function.
+    user_df = dataframe_preprocessing(user_df)
+    # Drop non-monkeypox related rows.
+    user_df['monkeypox'] = user_df['text'].str.contains('monkeypox|monkey pox|money pox', case=False, regex=True)
+    user_df.drop(user_df[user_df.monkeypox == False].index, inplace=True)
+    # Concatenating chosen features.
+    concatenated_df = feature_concatenation(user_df,['text'])
+    # STAGE 3. CALLING CLASSIFIER AND RETURNING RESULTS
+    # Calling classifier.
+    classified_tweets = classify_tweets(concatenated_df)
+    # Returning dictionary of classified tweets.
+    return classified_tweets
+def on_receipt_of_tweet_query(request:str,client:tw.Client):
+    """
+    This function defines what the app should do on receipt of a tweet
+    URL / ID from the end-user. It performs the following steps:
+    (i) formats the string submitted by the userinto a parsable form;
+    (ii) fetches data for the tweet using Tweepy; (iii) performs some
+    basic preprocessing on the data; (iv) calls dedicated preprocessing
+    functions to finish preprocessing the data; (v) calls the classifier
+    on the tweet; (vi) determines whether superspreader analysis is
+    needed (i.e. if tweet is classed as misleading); (vii) if so,
+    calls get_user_tweet function and calculates a superspreader score;
+    (viii) returns a tuple of data for the application to display.
+    Parameters:
+    1. request (str): the URL or ID provided by the end-user.
+    2. client: the Tweepy client instantiated by load_client.
+    Returns:
+    1. classified_tweet (dict): the metrics returned for the tweet by classify_tweets.
+    2. spreader_score (float): where applicable, a metric representing the
+    3. extent to which the user can be regarded as a superspreader of misinformation.
+    4. tweet_text (str): the text of the tweet queried by the end-user.
+    5. followers_count (int): the number of followers that the user has.
+    6. classified_user_tweets (dict): where applicable, the metrics returned by
+    7. get_user_tweets.
+    """
+    # STAGE 1. FETCH DATA FOR REQUESTED TWEET
+    # If URL is provided by the end-user, strip out the tweet ID.
+    if '/' in request:
+        request = request.split('/')[-1]
+    # Collect tweet data -- interrupt if invalid input provided.
+    tweet = client.get_tweets(ids=request,
+                              expansions=['author_id'],
+                              media_fields=None,
+                              place_fields=None,
+                              poll_fields=None,
+                              tweet_fields=['author_id','created_at','public_metrics','source'],
+                              user_fields=['created_at','description','location','public_metrics','url','verified'],
+                              user_auth=False)
+    # STAGE 2. PREPROCESSING OF TWEET DATA
+    # Create dictionaries out of the tweet and user data.
+    for i in tweet.data:
+        tweet_fields = dict(i)
+    for i in tweet.includes['users']:
+        user_fields = dict(i)
+    # Add the data from the user dict to the tweet dict.
+    for key, val in user_fields.items():
+        newKey = "user"+key
+        tweet_fields[newKey] = val
+    # Unpack any values which are themselves dictionaries.
+    additional_values = {}
+    for key, val in tweet_fields.items():
+        if type(val) == dict:
+            for subkey, subval in val.items():
+                additional_values[subkey] = subval
+    tweet_fields.update(additional_values)
+    # Convert everything to a DataFrame.
+    tweet_df = pd.DataFrame(tweet_fields,index=[0])
+    # Store the raw tweet text itself for later use.
+    tweet_text = tweet_df['text'][0]
+    # Store the followers count for later use.
+    followers_count = tweet_df['followers_count'][0]
+    # Preprocess the data using dedicated functions.
+    tweet_df = dataframe_preprocessing(tweet_df)
+    concatenated_tweet_df = feature_concatenation(tweet_df,['text'])
+    # STAGE 3. CALLING CLASSIFIER AND DETERMINING NEXT STEPS
+    # Call the classifier on the tweet.
+    classified_tweet = classify_tweets(concatenated_tweet_df)
+    # If the tweet is misleading, call get_user_tweets and calculate
+    # the user's superspreader score.
+    if len(classified_tweet['badPosts']) == 1:
+        # Fetch a dictionary of classified user tweets
+        classified_user_tweets = get_user_tweets(tweet_df['userid'][0],14,client=client)
+        # Calculate the total number of retweets for all misleading posts.
+        retweets_total = 0
+        for tweet in classified_user_tweets['badPosts']:
+            retweets_total += tweet[-1]
+        # Assign the p (post) value.
+        p = (0.21 * len(classified_user_tweets['badPosts'])) ** 1.13
+        # Assign the f (follower) value
+        f = (0.25 * (np.log10(followers_count+1))) ** 4.73
+        # Assign the r (retweet) value
+        r = (1.04 * (np.log10(retweets_total+1))) ** 0.96
+        # Calculate spreader_score and return a tuple of info.
+        spreader_score = max(((1 - (1/(max(1,p+f+r))))*100),1)
+        return classified_tweet, tweet_text, followers_count, classified_user_tweets, retweets_total, spreader_score,
+    # Otherwise, if tweet is not misleading, return the same info
+    # (excluding any superspreader related variables).
+    elif len(classified_tweet['goodPosts']) == 1:
+        return classified_tweet, tweet_text, followers_count, 0, 0, 0
+    # Contingency in case an error should unexpectedly occur.
+    else:
+        raise Exception("Something went wrong whilst processing tweet data.")
+def webpage():
+    """
+    This function structures the main page of the web app using the
+    conventions of Streamlit. It begins by loading the model, the tokenizer
+    and the Tweepy client using the functions dedicated to those tasks.
+    Each of these elements is then cached. The remaining content that the
+    function generates then depends mostly on the inputs provided by the
+    end-user.
+    Parameters: none.
+    Returns: nothing.
+    """
+    # Create a container for displaying loading messages which will clear
+    # once the tokenizer, Tweepy client and transformer model have loaded.
+    loading_container = st.empty()
+    with loading_container.container():
+        global model
+        model = load_model()
+        global client
+        client = load_client()
+        global tokenizer
+        tokenizer = load_tokenizer()
+    loading_container.empty()
+    # Write header content (e.g. banner image, title, description).
+    st.image("monkeypox-small.jpg")
+    st.title("Monkeypox misinformation detector")
+    st.write("Use this tool to detect whether a tweet contains\
+            monkeypox misinformation and assess the extent to which its\
+            poster can be considered a misinformation superspreader.")
+    st.sidebar.subheader("About")
+    st.sidebar.write("This app has been developed using a\
+                     [COVID-Twitter-BERT](https://huggingface.co/digitalepidemiologylab/covid-twitter-bert-v2)\
+                     model fine-tuned on a monkeypox misinformation\
+                     dataset. Users can learn more about the\
+                     [model](https://www.bbc.co.uk/sport) on the\
+                     HuggingFace model repository and can explore on\
+                     Kaggle the [dataset](https://www.kaggle.com/datasets/stephencrone/monkeypox)\
+                     on which the model was trained. Further\
+                     [documentation](https://www.kaggle.com/datasets/stephencrone/monkeypox),\
+                     as well as the source code for the app, can be\
+                     found in the project's GitHub repository.")
+    st.sidebar.subheader("Contact")
+    st.sidebar.write("If you have any questions, comments or feedback\
+                    regarding this app that are not answered by the\
+                    supporting documentation for the underpinning\
+                    dataset or transformer model, please feel free\
+                    to contact the author at sgscrone@liverpool.ac.uk.")
+    # Provide a text box for user to enter tweet ID / URL.
+    tweet_to_check = st.text_input("Please provide a tweet URL or ID", key="name")
+    # If the string provided by the user is empty, do nothing.
+    if tweet_to_check != "":
+        # Otherwise, if string is not empty, try fetching tweet using function.
+        try:
+            classified_tweet, tweet_text, followers_count, classified_user_tweets, retweets_total, spreader_score = on_receipt_of_tweet_query(tweet_to_check,client)
+            st.markdown("""<hr style="height:1px;border:none;background-color:#a6a6a6; margin-top:16px; margin-bottom:20px;" /> """, unsafe_allow_html=True)
+            col1, col2 = st.columns(2)
+            # In left column, present tweet text.
+            col1.subheader("Tweet")
+            tweet_text = textacy.preprocessing.normalize.whitespace(tweet_text)
+            col1.markdown('<p style="background-color: #F0F2F6; padding: 8px 8px 8px 8px;">{}{}</p>'.format(tweet_text,type(tweet_text)),unsafe_allow_html=True)
+            # In right column, present tweet classification.
+            col2.subheader("Rating for this tweet")
+            if len(classified_tweet['goodPosts']) != 0:
+                # Format blue for not misinformation.
+                col2.markdown('<p style="color:White; background-color: #1661AD; text-align: center; font-size: 20px;">Not misinformation</p>',unsafe_allow_html=True)
+                col2.markdown('<p style="font-size: 40px; text-align: center;">{}</p>'.format(format(classified_tweet['goodPosts'][0][2],'.0%')), unsafe_allow_html=True)
+                col2.markdown('<p style="text-align: center;">confidence level</p>', unsafe_allow_html=True)
+            else:
+                # Format red for misinformation.
+                col2.markdown('<p style="color:White; background-color: #701B20; text-align: center; font-size: 20px;">Misinformation</p>',unsafe_allow_html=True)
+                col2.markdown('<p style="font-size: 40px; text-align: center;">{}</p>'.format(format(classified_tweet['badPosts'][0][2],'.0%')), unsafe_allow_html=True)
+                col2.markdown('<p style="text-align: center;">confidence level</p>', unsafe_allow_html=True)
+                # Add additional container to display superspreader analysis.
+                superspreader_container = st.container()
+                superspreader_container.subheader("Superspreader rating for this user")
+                # Plot the superspreader score as a bar chart.
+                score_to_plot = pd.DataFrame({"classified_tweet":["score"],"spreader_score":[spreader_score]})
+                bar = alt.Chart(score_to_plot).mark_bar().encode(alt.X('spreader_score:Q',scale=alt.Scale(domain=(0, 100)), axis=None), alt.Y('classified_tweet',axis=None)).properties(height=60)
+                if spreader_score > 10:
+                    label = bar.mark_text(align='right',baseline='middle', dx=-10, color='white', fontSize=20).encode(text=alt.Text("spreader_score:Q", format=",.0f"))
+                else:
+                    label = bar.mark_text(align='right',baseline='middle', dx=25, color='black', fontSize=20).encode(text=alt.Text("spreader_score:Q", format=",.0f"))
+                x = bar+label
+                x = x.configure_mark(color='#701B20')
+                superspreader_container.altair_chart(x, use_container_width=True)
+                # Display stats on which calculation was based.
+                superspreader_container.write("Based on the user's **{:,} followers** and the following **{} tweet(s)** published over the last two weeks, which together received **{:,} retweet(s)**.".format(followers_count,len(classified_user_tweets['badPosts']),retweets_total))
+                # And print offending tweets from user's recent history.
+                for i in range(len(classified_user_tweets['badPosts'])):
+                    recent_tweet = classified_user_tweets['badPosts'][i][0]
+                    recent_tweet = recent_tweet.split('text:')[-1]
+                    superspreader_container.markdown('<p style="background-color: #F0F2F6; padding: 8px 8px 8px 8px;">{}</p>'.format(recent_tweet),unsafe_allow_html=True)
+        except:
+            st.error("Could not retrieve information for tweet. Please ensure you are supplying a valid tweet ID or URL.")
+    st.markdown("""<hr style="height:1px;border:none;background-color:#a6a6a6; margin-top:16px; margin-bottom:20px;" /> """, unsafe_allow_html=True)
+if __name__ == "__main__":
+    webpage()