# Toxic Tweet Classifier — Streamlit app (Hugging Face Spaces).
# (The original upload carried scraped page residue here: "Spaces:" and two
# "Runtime error" status lines, which are not Python source.)
# Streamlit app: runs a fine-tuned toxicity classifier over a batch of
# tweets and shows, for each tweet, its highest-rated toxicity class.
import streamlit as st
from transformers import pipeline
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow import keras

st.title('Toxic Tweet Classifier')  # Header of the application

# Let the user choose which fine-tuned model evaluates the tweets.
modelChoice = st.selectbox(
    'Select a fine-tuned toxicity model to evaluate the tweets below.',
    ("Toxicity Model (Trained for 1 epoch)",
     "Toxicity Model (Trained for 3 epochs)"))

# Load only the chosen model.  (The original loaded the 1-epoch model
# unconditionally and then sometimes replaced it, paying for two loads.)
if modelChoice == "Toxicity Model (Trained for 3 epochs)":
    model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
else:
    model = tf.keras.models.load_model('toxicity_model_1_epoch.h5')

dataset = pd.read_csv('train.csv')      # dataset the models were trained on
tweet_data = pd.read_csv('tweets.csv')  # dataset (previously test.csv) used as tweets

comments = dataset['comment_text']      # training comments, used to fit the vectorizer
tweets = tweet_data['comment_text']     # tweets to classify

# Vectorizer configuration: vocabulary capped at 2,500,000 tokens, output
# truncated/padded to length 1800, integer token indices as output.
# NOTE(review): this must match the configuration the models were trained
# with — confirm against the training script.
vectorizer = TextVectorization(max_tokens=2500000,
                               output_sequence_length=1800,
                               output_mode='int')
vectorizer.adapt(comments.values)  # build the vocabulary from the training comments

# Model output order: [toxicity, severe_toxicity, obscene, threat, insult,
# identity_hate].  Only the five sub-classes (indices 1-5) compete for the
# "highest class" label below; the umbrella 'toxicity' score (index 0) was
# excluded from the comparison in the original code — presumably deliberate,
# but worth confirming.
CLASS_NAMES = ("Severe toxicity", "Obscenity", "Threat", "Insult", "Identity hate")

highest_classes = []        # highest-rated toxicity class per tweet
highest_class_ratings = []  # rating value of that class per tweet
table_tweets = []           # tweet text, for the output table

MAX_TWEETS = 33   # restrict output to the first 33 tweets to prevent oversized output
MAX_LENGTH = 450  # filter out tweets that are oversized

for index, tweet in enumerate(tweets):
    if index >= MAX_TWEETS:
        break  # stop instead of scanning the remainder of the dataset
    if len(tweet) >= MAX_LENGTH:
        continue
    # Vectorize the tweet, add a batch dimension, and predict its six scores.
    guess = model.predict(np.expand_dims(vectorizer(tweet), 0))
    sub_scores = guess[0].tolist()[1:6]  # the five competing sub-classes
    # First-max semantics: ties resolve to the earlier class, exactly as the
    # original chain of strict '>' comparisons did.
    best = max(range(len(sub_scores)), key=sub_scores.__getitem__)
    highest_classes.append(CLASS_NAMES[best])
    highest_class_ratings.append(sub_scores[best])
    table_tweets.append(tweet)

# Present the results: each tweet, its top class, and that class's score.
df = pd.DataFrame({'Tweet': table_tweets,
                   'Highest Class': highest_classes,
                   'Probability': highest_class_ratings})
st.dataframe(df)  # Print out the table of the dataframe to the user