JulianHame committed on
Commit
65901e5
·
1 Parent(s): f6fb050

Added many comments for documentation purposes

Browse files
Files changed (1) hide show
  1. app.py +44 -31
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  from transformers import pipeline
3
  import tensorflow as tf
@@ -6,40 +7,43 @@ import pandas as pd
6
  from tensorflow.keras.layers import TextVectorization
7
  from tensorflow import keras
8
 
9
- st.title('Toxic Tweet Classifier')
10
 
 
11
  modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
12
  ("Toxicity Model (Trained for 1 epoch)",
13
  "Toxicity Model (Trained for 3 epochs)"))
14
 
15
- model = tf.keras.models.load_model('toxicity_model_1_epoch.h5')
16
- if(modelChoice == "Toxicity Model (Trained for 3 epochs)"):
17
  model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
18
 
19
- dataset = pd.read_csv('train.csv')
20
- tweets = pd.read_csv('tweets.csv')
21
 
22
- comments = dataset['comment_text']
23
- tweets = tweets['comment_text']
24
 
25
- vectorizer = TextVectorization(max_tokens = 2500000,
26
- output_sequence_length=1800,
27
- output_mode='int')
 
28
 
29
- vectorizer.adapt(comments.values)
30
 
31
- highest_classes = []
32
- highest_class_ratings = []
33
- table_tweets = []
34
 
35
- x = 0
36
- for tweet in tweets:
37
- if(x < 33):
38
- if(len(tweet) < 450):
39
- input_str = vectorizer(tweet)
40
- res = model.predict(np.expand_dims(input_str,0))
41
- classification = res[0].tolist()
42
-
 
43
  toxicity = classification[0]
44
  toxicity_severe = classification[1]
45
  obscene = classification[2]
@@ -47,30 +51,39 @@ for tweet in tweets:
47
  insult = classification[4]
48
  identity_hate = classification[5]
49
 
50
- highest_class = "Severe toxicity"
51
- highest_class_rating = toxicity_severe
 
 
52
  if(obscene > highest_class_rating):
53
  highest_class = "Obscenity"
54
  highest_class_rating = obscene
 
 
55
  if(threat > highest_class_rating):
56
  highest_class = "Threat"
57
  highest_class_rating = threat
 
 
58
  if(insult > highest_class_rating):
59
  highest_class = "Insult"
60
  highest_class_rating = insult
 
 
61
  if(identity_hate > highest_class_rating):
62
  highest_class = "Identity hate"
63
  highest_class_rating = identity_hate
64
-
65
- highest_classes.append(highest_class)
66
- highest_class_ratings.append(highest_class_rating)
67
- table_tweets.append(tweet)
68
- x = x + 1
69
 
 
 
 
 
 
 
70
  data = {'Tweet': table_tweets,
71
  'Highest Class': highest_classes,
72
  'Probability': highest_class_ratings}
73
 
74
 
75
- df = pd.DataFrame(data)
76
- st.dataframe(df)
 
1
+ # Importing required modules
2
  import streamlit as st
3
  from transformers import pipeline
4
  import tensorflow as tf
 
7
  from tensorflow.keras.layers import TextVectorization
8
  from tensorflow import keras
9
 
10
+ st.title('Toxic Tweet Classifier') # Header of application
11
 
12
+ # Allow user to choose model in dropdown menu
13
  modelChoice = st.selectbox('Select a fine-tuned toxicity model to evaluate the tweets below.',
14
  ("Toxicity Model (Trained for 1 epoch)",
15
  "Toxicity Model (Trained for 3 epochs)"))
16
 
17
+ model = tf.keras.models.load_model('toxicity_model_1_epoch.h5') # The 1-epoch model is chosen by default
18
+ if(modelChoice == "Toxicity Model (Trained for 3 epochs)"): # If the user changes their choice to 3-epochs, it is chosen
19
  model = tf.keras.models.load_model('toxicity_model_3_epochs.h5')
20
 
21
+ dataset = pd.read_csv('train.csv') # Reads .csv of dataset that the models were trained on
22
+ tweets = pd.read_csv('tweets.csv') # Reads .csv of dataset (previously test.csv) to use as tweets
23
 
24
+ comments = dataset['comment_text'] # Training dataset is now referred to as "comments"
25
+ tweets = tweets['comment_text'] # Tweets dataset is now referred to as "tweets"
26
 
27
+ # Vectorizer characteristics
28
+ vectorizer = TextVectorization(max_tokens = 2500000, # Vocabulary size set to maximum of 2,500,000
29
+ output_sequence_length = 1800, # Pad or truncate every output sequence to exactly 1800 tokens
30
+ output_mode='int') # Outputs integer indices for split string tokens
31
 
32
+ vectorizer.adapt(comments.values) # Build the vectorizer's vocabulary from the training dataset's comments
33
 
34
+ highest_classes = [] # List to store the highest-rated toxicity class of each tweet
35
+ highest_class_ratings = [] # List to store the highest class rating value of each tweet
36
+ table_tweets = [] # List to store tweet contents for the table shown to the user
37
 
38
+ x = 0 # Counter used to cap how many tweets are processed, initialized as 0
39
+ for tweet in tweets: # For every tweet in the dataset of tweets
40
+ if(x < 33): # Restricts loop to first 33 tweets to prevent oversized output
41
+ if(len(tweet) < 450): # Filters out tweets that are oversized
42
+ input_str = vectorizer(tweet) # Input string is set to the vectorized tweet data
43
+ guess = model.predict(np.expand_dims(input_str,0)) # Predict classification values for the current tweet
44
+ classification = guess[0].tolist() # Assign classification values to a list
45
+
46
+ # Assign classification values to their respective names
47
  toxicity = classification[0]
48
  toxicity_severe = classification[1]
49
  obscene = classification[2]
 
51
  insult = classification[4]
52
  identity_hate = classification[5]
53
 
54
+ highest_class = "Severe toxicity" # Set default highest class as "Severe toxicity"
55
+ highest_class_rating = toxicity_severe # Set default highest rating as severe toxicity's rating
56
+
57
+ # If obscenity has a higher rating, set the highest class and highest rating to it
58
  if(obscene > highest_class_rating):
59
  highest_class = "Obscenity"
60
  highest_class_rating = obscene
61
+
62
+ # If threat has a higher rating, set the highest class and highest rating to it
63
  if(threat > highest_class_rating):
64
  highest_class = "Threat"
65
  highest_class_rating = threat
66
+
67
+ # If insult has a higher rating, set the highest class and highest rating to it
68
  if(insult > highest_class_rating):
69
  highest_class = "Insult"
70
  highest_class_rating = insult
71
+
72
+ # If identity hate has a higher rating, set the highest class and highest rating to it
73
  if(identity_hate > highest_class_rating):
74
  highest_class = "Identity hate"
75
  highest_class_rating = identity_hate
 
 
 
 
 
76
 
77
+ highest_classes.append(highest_class) # Append the highest-rated class of the current tweet to its list
78
+ highest_class_ratings.append(highest_class_rating) # Append the highest rating value of the current tweet to its list
79
+ table_tweets.append(tweet) # Append the contents of the current tweet to its list
80
+ x = x + 1 # Increase the counter by 1 (the for loop itself advances to the next tweet)
81
+
82
+ # Organize Tweets, highest classes and highest class ratings arrays into a dictionary
83
  data = {'Tweet': table_tweets,
84
  'Highest Class': highest_classes,
85
  'Probability': highest_class_ratings}
86
 
87
 
88
+ df = pd.DataFrame(data) # Create a pandas dataframe using the dictionary created above
89
+ st.dataframe(df) # Print out table of the dataframe to the user