Spaces:
Runtime error
Runtime error
JulianHame
committed on
Commit
·
65901e5
1
Parent(s):
f6fb050
Added many comments for documentation purposes
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
from transformers import pipeline
|
3 |
import tensorflow as tf
|
@@ -6,40 +7,43 @@ import pandas as pd
|
|
6 |
from tensorflow.keras.layers import TextVectorization
|
7 |
from tensorflow import keras
|
8 |
|
9 |
-
st.title('Toxic Tweet Classifier')
|
10 |
|
|
|
11 |
modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
|
12 |
("Toxicity Model (Trained for 1 epoch)",
|
13 |
"Toxicity Model (Trained for 3 epochs)"))
|
14 |
|
15 |
-
model = tf.keras.models.load_model('toxicity_model_1_epoch.h5')
|
16 |
-
if(modelChoice == "Toxicity Model (Trained for 3 epochs)"):
|
17 |
model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
|
18 |
|
19 |
-
dataset = pd.read_csv('train.csv')
|
20 |
-
tweets = pd.read_csv('tweets.csv')
|
21 |
|
22 |
-
comments = dataset['comment_text']
|
23 |
-
tweets = tweets['comment_text']
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
28 |
|
29 |
-
vectorizer.adapt(comments.values)
|
30 |
|
31 |
-
highest_classes = []
|
32 |
-
highest_class_ratings = []
|
33 |
-
table_tweets = []
|
34 |
|
35 |
-
x = 0
|
36 |
-
for tweet in tweets:
|
37 |
-
if(x < 33):
|
38 |
-
if(len(tweet) < 450):
|
39 |
-
input_str = vectorizer(tweet)
|
40 |
-
|
41 |
-
classification =
|
42 |
-
|
|
|
43 |
toxicity = classification[0]
|
44 |
toxicity_severe = classification[1]
|
45 |
obscene = classification[2]
|
@@ -47,30 +51,39 @@ for tweet in tweets:
|
|
47 |
insult = classification[4]
|
48 |
identity_hate = classification[5]
|
49 |
|
50 |
-
highest_class = "Severe toxicity"
|
51 |
-
highest_class_rating = toxicity_severe
|
|
|
|
|
52 |
if(obscene > highest_class_rating):
|
53 |
highest_class = "Obscenity"
|
54 |
highest_class_rating = obscene
|
|
|
|
|
55 |
if(threat > highest_class_rating):
|
56 |
highest_class = "Threat"
|
57 |
highest_class_rating = threat
|
|
|
|
|
58 |
if(insult > highest_class_rating):
|
59 |
highest_class = "Insult"
|
60 |
highest_class_rating = insult
|
|
|
|
|
61 |
if(identity_hate > highest_class_rating):
|
62 |
highest_class = "Identity hate"
|
63 |
highest_class_rating = identity_hate
|
64 |
-
|
65 |
-
highest_classes.append(highest_class)
|
66 |
-
highest_class_ratings.append(highest_class_rating)
|
67 |
-
table_tweets.append(tweet)
|
68 |
-
x = x + 1
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
data = {'Tweet': table_tweets,
|
71 |
'Highest Class': highest_classes,
|
72 |
'Probability': highest_class_ratings}
|
73 |
|
74 |
|
75 |
-
df = pd.DataFrame(data)
|
76 |
-
st.dataframe(df)
|
|
|
1 |
+
# Importing required modules
|
2 |
import streamlit as st
|
3 |
from transformers import pipeline
|
4 |
import tensorflow as tf
|
|
|
7 |
from tensorflow.keras.layers import TextVectorization
|
8 |
from tensorflow import keras
|
9 |
|
10 |
+
st.title('Toxic Tweet Classifier') # Header of application
|
11 |
|
12 |
+
# Allow user to choose model in dropdown menu
|
13 |
modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
|
14 |
("Toxicity Model (Trained for 1 epoch)",
|
15 |
"Toxicity Model (Trained for 3 epochs)"))
|
16 |
|
17 |
+
model = tf.keras.models.load_model('toxicity_model_1_epoch.h5') # The 1-epoch model is chosen by default
|
18 |
+
if(modelChoice == "Toxicity Model (Trained for 3 epochs)"): # If the user changes their choice to 3-epochs, it is chosen
|
19 |
model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
|
20 |
|
21 |
+
dataset = pd.read_csv('train.csv') # Reads .csv of dataset that the models were trained on
|
22 |
+
tweets = pd.read_csv('tweets.csv') # Reads .csv of dataset (previously test.csv) to use as tweets
|
23 |
|
24 |
+
comments = dataset['comment_text'] # Training dataset is now referred to as "comments"
|
25 |
+
tweets = tweets['comment_text'] # Tweets dataset is now referred to as "tweets"
|
26 |
|
27 |
+
# Vectorizer characteristics
|
28 |
+
vectorizer = TextVectorization(max_tokens = 2500000, # Vocabulary size set to maximum of 2,500,000
|
29 |
+
output_sequence_length = 1800, # Pad or truncate each output sequence to exactly 1800 tokens
|
30 |
+
output_mode='int') # Outputs integer indices for split string tokens
|
31 |
|
32 |
+
vectorizer.adapt(comments.values) # Build the vectorizer's vocabulary from the training dataset's comments
|
33 |
|
34 |
+
highest_classes = [] # Array to store highest-rated toxicity classes for all tweets
|
35 |
+
highest_class_ratings = [] # Array to store the highest class rating values for all tweets
|
36 |
+
table_tweets = [] # Array to store tweet contents to use in printing a table to the user
|
37 |
|
38 |
+
x = 0 # Index initialized as 0
|
39 |
+
for tweet in tweets: # For every tweet in the dataset of tweets
|
40 |
+
if(x < 33): # Restricts loop to first 33 tweets to prevent oversized output
|
41 |
+
if(len(tweet) < 450): # Filters out tweets that are oversized
|
42 |
+
input_str = vectorizer(tweet) # Input string is set to the vectorized tweet data
|
43 |
+
guess = model.predict(np.expand_dims(input_str,0)) # Predict classification values for each tweet
|
44 |
+
classification = guess[0].tolist() # Assign classification values to a list
|
45 |
+
|
46 |
+
# Assign classification values to their respective names
|
47 |
toxicity = classification[0]
|
48 |
toxicity_severe = classification[1]
|
49 |
obscene = classification[2]
|
|
|
51 |
insult = classification[4]
|
52 |
identity_hate = classification[5]
|
53 |
|
54 |
+
highest_class = "Severe toxicity" # Set default highest class as "Severe toxicity"
|
55 |
+
highest_class_rating = toxicity_severe # Set default highest rating as severe toxicity's rating
|
56 |
+
|
57 |
+
# If obscenity has a higher rating, set the highest class and highest rating to it
|
58 |
if(obscene > highest_class_rating):
|
59 |
highest_class = "Obscenity"
|
60 |
highest_class_rating = obscene
|
61 |
+
|
62 |
+
# If threat has a higher rating, set the highest class and highest rating to it
|
63 |
if(threat > highest_class_rating):
|
64 |
highest_class = "Threat"
|
65 |
highest_class_rating = threat
|
66 |
+
|
67 |
+
# If insult has a higher rating, set the highest class and highest rating to it
|
68 |
if(insult > highest_class_rating):
|
69 |
highest_class = "Insult"
|
70 |
highest_class_rating = insult
|
71 |
+
|
72 |
+
# If identity hate has a higher rating, set the highest class and highest rating to it
|
73 |
if(identity_hate > highest_class_rating):
|
74 |
highest_class = "Identity hate"
|
75 |
highest_class_rating = identity_hate
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
+
highest_classes.append(highest_class) # Append array with the highest-rated class of the current tweet
|
78 |
+
highest_class_ratings.append(highest_class_rating) # Append array with the highest rating value of the current tweet
|
79 |
+
table_tweets.append(tweet) # Append array with contents of the current tweet
|
80 |
+
x = x + 1 # Increase index value by 1 to arrive at next tweet
|
81 |
+
|
82 |
+
# Organize Tweets, highest classes and highest class ratings arrays into a dictionary
|
83 |
data = {'Tweet': table_tweets,
|
84 |
'Highest Class': highest_classes,
|
85 |
'Probability': highest_class_ratings}
|
86 |
|
87 |
|
88 |
+
df = pd.DataFrame(data) # Create a pandas dataframe using the dictionary created above
|
89 |
+
st.dataframe(df) # Print out table of the dataframe to the user
|