toxic-tweet-classifier

Runtime error

App Files Files Community

JulianHame committed on Apr 30, 2023

Commit

65901e5

1 Parent(s): f6fb050

Added many comments for documentation purposes

Browse files

Files changed (1) hide show

app.py +44 -31

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import streamlit as st
 from transformers import pipeline
 import tensorflow as tf
@@ -6,40 +7,43 @@ import pandas as pd
 from tensorflow.keras.layers import TextVectorization
 from tensorflow import keras
-st.title('Toxic Tweet Classifier')
 modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
                            ("Toxicity Model (Trained for 1 epoch)",
                            "Toxicity Model (Trained for 3 epochs)"))
-model = tf.keras.models.load_model('toxicity_model_1_epoch.h5')
-if(modelChoice == "Toxicity Model (Trained for 3 epochs)"):
     model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
-dataset = pd.read_csv('train.csv')
-tweets = pd.read_csv('tweets.csv')
-comments = dataset['comment_text']
-tweets = tweets['comment_text']
-vectorizer = TextVectorization(max_tokens = 2500000,
-                               output_sequence_length=1800,
-                               output_mode='int')
-vectorizer.adapt(comments.values)
-highest_classes = []
-highest_class_ratings = []
-table_tweets = []
-x = 0
-for tweet in tweets:
-    if(x < 33):
-        if(len(tweet) < 450):
-            input_str = vectorizer(tweet)
-            res = model.predict(np.expand_dims(input_str,0))
-            classification = res[0].tolist()
             toxicity = classification[0]
             toxicity_severe = classification[1]
             obscene = classification[2]
@@ -47,30 +51,39 @@ for tweet in tweets:
             insult = classification[4]
             identity_hate = classification[5]
-            highest_class = "Severe toxicity"
-            highest_class_rating = toxicity_severe
             if(obscene > highest_class_rating):
                 highest_class = "Obscenity"
                 highest_class_rating = obscene
             if(threat > highest_class_rating):
                 highest_class = "Threat"
                 highest_class_rating = threat
             if(insult > highest_class_rating):
                 highest_class = "Insult"
                 highest_class_rating = insult
             if(identity_hate > highest_class_rating):
                 highest_class = "Identity hate"
                 highest_class_rating = identity_hate
-            highest_classes.append(highest_class)
-            highest_class_ratings.append(highest_class_rating)
-            table_tweets.append(tweet)
-            x = x + 1
 data = {'Tweet': table_tweets,
         'Highest Class': highest_classes,
         'Probability': highest_class_ratings}
-df = pd.DataFrame(data)
-st.dataframe(df)

+# Importing required modules
 import streamlit as st
 from transformers import pipeline
 import tensorflow as tf
 from tensorflow.keras.layers import TextVectorization
 from tensorflow import keras
+st.title('Toxic Tweet Classifier') # Header of application
+# Allow user to choose model in dropdown menu
 modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
                            ("Toxicity Model (Trained for 1 epoch)",
                            "Toxicity Model (Trained for 3 epochs)"))
+model = tf.keras.models.load_model('toxicity_model_1_epoch.h5') # The 1-epoch model is chosen by default
+if(modelChoice == "Toxicity Model (Trained for 3 epochs)"):     # If the user changes their choice to 3-epochs, it is chosen
     model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
+dataset = pd.read_csv('train.csv') # Reads .csv of dataset that the models were trained on
+tweets = pd.read_csv('tweets.csv') # Reads .csv of dataset (previously test.csv) to use as tweets
+comments = dataset['comment_text'] # Training dataset is now referred to as "comments"
+tweets = tweets['comment_text']    # Tweets dataset is now referred to as "tweets"
+# Vectorizer characteristics
+vectorizer = TextVectorization(max_tokens = 2500000,            # Vocabulary size set to maximum of 2,500,000
+                               output_sequence_length = 1800,   # Truncate output's dimension to 1800
+                               output_mode='int')               # Outputs integer indices for split string tokens
+vectorizer.adapt(comments.values)    # Vectorize the comments from the training dataset
+highest_classes = []          # Array to store highest-rated toxicity classes for all tweets
+highest_class_ratings = []    # Array to store the highest class rating values for all tweets
+table_tweets = []             # Array to store tweet contents to use in printing a table to the user
+x = 0                                                            # Index initialized as 0
+for tweet in tweets:                                             # For every tweet in the dataset of tweets
+    if(x < 33):                                                  # Restricts loop to first 33 tweets to prevent oversized output
+        if(len(tweet) < 450):                                    # Filters out tweets that are oversized
+            input_str = vectorizer(tweet)                        # Input string is set to the vectorized tweet data
+            guess = model.predict(np.expand_dims(input_str,0))   # Predict classification values for each tweet
+            classification = guess[0].tolist()                   # Assign classification values to a list
+            # Assign classification values to their respective names
             toxicity = classification[0]
             toxicity_severe = classification[1]
             obscene = classification[2]
             insult = classification[4]
             identity_hate = classification[5]
+            highest_class = "Severe toxicity"           # Set default highest class as "Severe toxicity"
+            highest_class_rating = toxicity_severe      # Set default highest rating as severe toxicity's rating
+            # If obscenity has a higher rating, set the highest class and highest rating to it
             if(obscene > highest_class_rating):
                 highest_class = "Obscenity"
                 highest_class_rating = obscene
+            # If threat has a higher rating, set the highest class and highest rating to it
             if(threat > highest_class_rating):
                 highest_class = "Threat"
                 highest_class_rating = threat
+            # If insult has a higher rating, set the highest class and highest rating to it
             if(insult > highest_class_rating):
                 highest_class = "Insult"
                 highest_class_rating = insult
+            # If identity hate has a higher rating, set the highest class and highest rating to it
             if(identity_hate > highest_class_rating):
                 highest_class = "Identity hate"
                 highest_class_rating = identity_hate
+            highest_classes.append(highest_class)                 # Append array with the highest-rated class of the current tweet
+            highest_class_ratings.append(highest_class_rating)    # Append array with the highest rating value of the current tweet
+            table_tweets.append(tweet)                            # Append array with contents of the current tweet
+            x = x + 1                                             # Increase index value by 1 to arrive at next tweet
+# Organize Tweets, highest classes and highest class ratings arrays into a dictionary
 data = {'Tweet': table_tweets,
         'Highest Class': highest_classes,
         'Probability': highest_class_ratings}
+df = pd.DataFrame(data)    # Create a pandas dataframe using the dictionary created above
+st.dataframe(df)           # Print out table of the dataframe to the user