# Import required modules
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

st.title('Toxic Tweet Classifier')  # Header of the application

# Let the user choose a model from a dropdown menu
model_choice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
                            ("Toxicity Model (Trained for 1 epoch)",
                             "Toxicity Model (Trained for 3 epochs)"))

# Load only the selected model; the 1-epoch model is the default
if model_choice == "Toxicity Model (Trained for 3 epochs)":
    model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
else:
    model = tf.keras.models.load_model('toxicity_model_1_epoch.h5')

dataset = pd.read_csv('train.csv')  # Dataset the models were trained on
tweets = pd.read_csv('tweets.csv')  # Dataset to use as tweets (previously test.csv)

comments = dataset['comment_text']  # Training comments, used to fit the vectorizer
tweets = tweets['comment_text']     # Tweet texts to classify

# Vectorizer characteristics
vectorizer = TextVectorization(max_tokens=2500000,           # Vocabulary capped at 2,500,000 tokens
                               output_sequence_length=1800,  # Pad/truncate every sequence to length 1800
                               output_mode='int')            # Output integer indices for string tokens
vectorizer.adapt(comments.values)  # Build the vocabulary from the training comments

highest_classes = []        # Highest-rated toxicity class for each tweet
highest_class_ratings = []  # Rating of that class for each tweet
table_tweets = []           # Tweet contents, for the table shown to the user

for x, tweet in enumerate(tweets):
    if x >= 33:            # Restrict the loop to the first 33 tweets to prevent oversized output
        break
    if len(tweet) >= 450:  # Filter out tweets that are oversized
        continue

    vectorized_tweet = vectorizer(tweet)  # Vectorize the tweet text
    guess = model.predict(np.expand_dims(vectorized_tweet, 0), verbose=0)  # Predict on a batch of one
    classification = guess[0].tolist()  # Six scores, in the order of the training label columns:
                                        # toxic, severe_toxic, obscene, threat, insult, identity_hate

    # Rank the five specific subtypes. The general "toxic" score (classification[0])
    # is not part of the ranking, so the table reports the strongest specific subtype;
    # on ties, the earliest entry in the dict wins.
    subtype_scores = {
        "Severe toxicity": classification[1],
        "Obscenity": classification[2],
        "Threat": classification[3],
        "Insult": classification[4],
        "Identity hate": classification[5],
    }
    highest_class = max(subtype_scores, key=subtype_scores.get)
    highest_class_rating = subtype_scores[highest_class]

    highest_classes.append(highest_class)               # Record the highest-rated class for this tweet
    highest_class_ratings.append(highest_class_rating)  # Record that class's rating
    table_tweets.append(tweet)                          # Record the tweet's contents

# Organize the tweets, highest classes, and ratings into a dataframe and display it as a table
data = {'Tweet': table_tweets, 'Highest Class': highest_classes, 'Probability': highest_class_ratings}
df = pd.DataFrame(data)
st.dataframe(df)
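
# To run the app locally (assuming this file is saved as app.py, with the .h5 model
# files, train.csv, and tweets.csv in the same directory):
#
#   streamlit run app.py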
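
# Streamlit re-runs this entire script on every user interaction, so load_model() and
# vectorizer.adapt() above execute again on each rerun. Below is a minimal sketch of a
# faster variant, assuming a Streamlit version that provides st.cache_resource (1.18+);
# the helper names are illustrative, not part of the app above.

@st.cache_resource
def load_toxicity_model(path):
    # Load a Keras model once; Streamlit reuses the cached object across reruns
    return tf.keras.models.load_model(path)

@st.cache_resource
def build_vectorizer(training_csv):
    # Fit the TextVectorization layer once on the training comments and cache it
    texts = pd.read_csv(training_csv)['comment_text']
    layer = TextVectorization(max_tokens=2500000,
                              output_sequence_length=1800,
                              output_mode='int')
    layer.adapt(texts.values)
    return layer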
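
# A possible speed-up (sketch): instead of one model.predict() call per tweet, the
# filtered tweets could be vectorized together and scored as a single batch.
# predict_batch is an illustrative helper, not called by the app above.

def predict_batch(model, vectorizer, tweet_texts):
    # tweet_texts: a list of tweet strings; returns an (n_tweets, 6) array of scores
    batch = vectorizer(np.array(tweet_texts))  # Shape: (n_tweets, 1800)
    return model.predict(batch, verbose=0)     # Shape: (n_tweets, 6)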