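The Space's app.py, reproduced below, loads one of two fine-tuned toxicity models selected from a dropdown, vectorizes a set of tweets with a TextVectorization layer adapted on the training comments, and shows each tweet's highest-rated toxicity class and its probability in a table.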
# Importing required modules
import streamlit as st
from transformers import pipeline
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow import keras
st.title('Toxic Tweet Classifier')  # Header of the application

# Allow the user to choose a model from a dropdown menu
modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
                           ("Toxicity Model (Trained for 1 epoch)",
                            "Toxicity Model (Trained for 3 epochs)"))

model = tf.keras.models.load_model('toxicity_model_1_epoch.h5')  # The 1-epoch model is loaded by default
if modelChoice == "Toxicity Model (Trained for 3 epochs)":  # If the user selects the 3-epoch model, load it instead
    model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
dataset = pd.read_csv('train.csv')  # Reads the .csv of the dataset the models were trained on
tweets = pd.read_csv('tweets.csv')  # Reads the .csv of the dataset (previously test.csv) to use as tweets

comments = dataset['comment_text']  # The training dataset is now referred to as "comments"
tweets = tweets['comment_text']     # The tweets dataset is now referred to as "tweets"

# Vectorizer characteristics
vectorizer = TextVectorization(max_tokens=2500000,           # Vocabulary size capped at 2,500,000
                               output_sequence_length=1800,  # Pad or truncate each output sequence to 1800 tokens
                               output_mode='int')            # Output integer indices for split string tokens
vectorizer.adapt(comments.values)  # Build the vocabulary from the training dataset's comments
highest_classes = []        # Highest-rated toxicity class for each tweet
highest_class_ratings = []  # Rating value of that highest class for each tweet
table_tweets = []           # Tweet contents, used to print a table to the user

x = 0  # Index initialized to 0
for tweet in tweets:          # For every tweet in the tweets dataset
    if x < 33:                # Restrict processing to the first 33 tweets to prevent oversized output
        if len(tweet) < 450:  # Filter out oversized tweets
            input_str = vectorizer(tweet)                        # Vectorize the tweet text
            guess = model.predict(np.expand_dims(input_str, 0))  # Predict classification values for the tweet
            classification = guess[0].tolist()                   # Convert the prediction to a plain list

            # Assign classification values to their respective names
            toxicity = classification[0]  # Overall toxicity score (not used in the comparisons below)
            toxicity_severe = classification[1]
            obscene = classification[2]
            threat = classification[3]
            insult = classification[4]
            identity_hate = classification[5]

            highest_class = "Severe toxicity"       # Default highest class is "Severe toxicity"
            highest_class_rating = toxicity_severe  # Default highest rating is severe toxicity's rating
            # If obscenity has a higher rating, make it the highest class
            if obscene > highest_class_rating:
                highest_class = "Obscenity"
                highest_class_rating = obscene
            # If threat has a higher rating, make it the highest class
            if threat > highest_class_rating:
                highest_class = "Threat"
                highest_class_rating = threat
            # If insult has a higher rating, make it the highest class
            if insult > highest_class_rating:
                highest_class = "Insult"
                highest_class_rating = insult
            # If identity hate has a higher rating, make it the highest class
            if identity_hate > highest_class_rating:
                highest_class = "Identity hate"
                highest_class_rating = identity_hate

            highest_classes.append(highest_class)               # Record the highest-rated class for this tweet
            highest_class_ratings.append(highest_class_rating)  # Record the highest rating value for this tweet
            table_tweets.append(tweet)                          # Record the tweet contents
    x += 1  # Move on to the next tweet
# Organize the tweets, highest classes and highest class ratings into a dictionary
data = {'Tweet': table_tweets,
        'Highest Class': highest_classes,
        'Probability': highest_class_ratings}
df = pd.DataFrame(data)  # Create a pandas DataFrame from the dictionary above
st.dataframe(df)         # Display the resulting table to the user
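For reference, the block of chained comparisons above can be collapsed into a single argmax over the specific classes. The sketch below is a minimal, non-authoritative rewrite that assumes the `model` and `vectorizer` defined in the listing and the same six-score output order; the helper name `highest_toxicity_class` is introduced here for illustration only.

import numpy as np

# Class names in the assumed output order of the model (taken from the listing above).
# Index 0 is the overall toxicity score, which the comparisons above also skip.
LABELS = ["Toxicity", "Severe toxicity", "Obscenity", "Threat", "Insult", "Identity hate"]

def highest_toxicity_class(tweet_text):
    """Return (label, score) for the highest-rated specific toxicity class of one tweet."""
    vectorized = vectorizer(tweet_text)                       # shape: (1800,)
    scores = model.predict(np.expand_dims(vectorized, 0))[0]  # shape: (6,)
    specific = scores[1:]                                     # drop the overall toxicity score
    best = int(np.argmax(specific))                           # first maximum, matching the strict '>' comparisons
    return LABELS[1:][best], float(specific[best])

Called once per tweet inside the loop, this would produce the same 'Highest Class' and 'Probability' values as the chained if statements, since np.argmax keeps the earlier class on ties just as the strict comparisons do.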