cooldragon12
committed on
Commit
•
f67c2de
1
Parent(s):
3d200ef
Added Multilingual BERT Base
Browse files- .gitignore +1 -2
- README.md +3 -0
- app.py +39 -34
- model_with_bert_multilingual.h5 +3 -0
- pipeline/model/__init__.py +1 -2
- pipeline/model/__pycache__/__init__.cpython-311.pyc +0 -0
- pipeline/model/model_with_bert_multilingual.py +25 -1
- pipeline/preprocessing/__init__.py +3 -30
- pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc +0 -0
- pipeline/preprocessing/constants.py +18 -0
- pipeline/preprocessing/decoder.py +14 -0
- pipeline/preprocessing/tokenizer.py +15 -0
.gitignore
CHANGED
@@ -1,3 +1,2 @@
|
|
1 |
**/__pycache__/
|
2 |
-
**venv/
|
3 |
-
model_with_bert_multilingual.h5
|
|
|
1 |
**/__pycache__/
|
2 |
+
**venv/
|
|
README.md
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
---
|
2 |
title: Multitask Classifying Emotion Toxicity Valorant Chat
|
|
|
3 |
emoji: π
|
4 |
colorFrom: blue
|
5 |
colorTo: purple
|
@@ -10,4 +11,6 @@ pinned: false
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
|
|
|
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Multitask Classifying Emotion Toxicity Valorant Chat
|
3 |
+
summary: This Space contains a Streamlit app that classifies the toxicity and emotion of Valorant chat messages using the ChattyTicket model.
|
4 |
emoji: π
|
5 |
colorFrom: blue
|
6 |
colorTo: purple
|
|
|
11 |
license: mit
|
12 |
---
|
13 |
|
14 |
+
|
15 |
+
|
16 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,66 +1,71 @@
|
|
1 |
import streamlit as st
|
2 |
-
|
3 |
from pipeline.model import MultiTaskModel
|
4 |
-
from pipeline.preprocessing import Preprocessor
|
5 |
|
6 |
# Load the model
|
7 |
-
|
8 |
-
"Angry",
|
9 |
-
"Disgust",
|
10 |
-
"Happy",
|
11 |
-
"Neutral",
|
12 |
-
"Sad",
|
13 |
-
"Surprise",
|
14 |
-
)
|
15 |
-
TOXICITY_CHOICES = (
|
16 |
-
"Blaming Others",
|
17 |
-
"Cyberbullying",
|
18 |
-
"Gameplay Experience Complaints",
|
19 |
-
"Gamesplaining",
|
20 |
-
"Multiple Discrimination",
|
21 |
-
"Not Toxic",
|
22 |
-
"Sarcasm",
|
23 |
-
)
|
24 |
|
25 |
-
st.
|
26 |
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
)
|
30 |
|
31 |
st.table(
|
32 |
{
|
33 |
-
"Emotion":
|
34 |
-
"Toxicity":
|
35 |
}
|
36 |
)
|
37 |
|
38 |
@st.cache_resource
|
39 |
-
def loading_model():
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
-
|
|
|
43 |
|
44 |
# Get user input
|
45 |
user_input = st.text_input("Enter a Valorant chat message:")
|
46 |
st.write("You entered:", user_input)
|
47 |
-
|
48 |
# Predict
|
49 |
prediction = model.predict(user_input)
|
50 |
emotions, toxicitys = prediction
|
|
|
51 |
|
52 |
col1, col2 = st.columns(2)
|
53 |
|
54 |
with col1:
|
|
|
55 |
for i, emotion in enumerate(emotions[0]):
|
56 |
-
st.write(f"{
|
57 |
st.progress(float(emotion))
|
58 |
with col2:
|
|
|
59 |
for i, toxicity in enumerate(toxicitys[0]):
|
60 |
-
st.write(f"{
|
61 |
st.progress(float(toxicity))
|
62 |
|
63 |
-
|
64 |
-
# Display the prediction
|
65 |
-
st.write("The predicted emotion is:", decoded[0][0])
|
66 |
-
st.write("The predicted toxicity is:", decoded[1][0])
|
|
|
1 |
import streamlit as st
|
|
|
2 |
from pipeline.model import MultiTaskModel
|
3 |
+
from pipeline.preprocessing import Preprocessor, EMOTION_LABELS, TOXICITY_LABELS
|
4 |
|
5 |
# Load the model
|
6 |
+
st.title("ChattyTicket Model: Emotion and Toxicity Classification of Valorant chat messages")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
st.write("""Discover our innovative model designed to tackle toxicity in Valorant, a popular multiplayer game. Leveraging advanced multi-task learning, our model combines Bi-LSTM and BERT architectures for superior performance. This approach enables accurate classification of toxic conversations and emotional content within the game.
|
9 |
|
10 |
+
Key Features:
|
11 |
+
- Multi-Task Learning: Simultaneous classification of toxicity and emotion.
|
12 |
+
- Bi-LSTM & BERT Integration: Enhanced accuracy with a BERT pre-trained backbone.
|
13 |
+
- High Accuracy: 91.81% in toxicity detection and 86.74% in emotion prediction.
|
14 |
+
Insights & Impact:
|
15 |
+
- Emotional Landscape: Identifies prevalent emotions like anger and instances of cyberbullying.
|
16 |
+
- Healthier Communities: Provides insights for fostering positive gaming environments.
|
17 |
+
ChattyTicket API:
|
18 |
+
- Evaluate chat text for emotion and toxicity with our user-friendly API. Positive user feedback highlights its effectiveness and interactivity, with ongoing improvements based on user input to better handle nuances like sarcasm.
|
19 |
+
|
20 |
+
Experience the future of online gaming moderation with our model and contribute to a healthier, more enjoyable gaming community.
|
21 |
+
"""
|
22 |
)
|
23 |
|
24 |
st.table(
|
25 |
{
|
26 |
+
"Emotion": EMOTION_LABELS,
|
27 |
+
"Toxicity": TOXICITY_LABELS,
|
28 |
}
|
29 |
)
|
30 |
|
31 |
@st.cache_resource
def loading_model(bert_base):
    """Build the classifier that matches the selected BERT checkpoint.

    Decorated with ``st.cache_resource`` so the constructed model is reused
    across Streamlit reruns instead of being rebuilt each time.

    Args:
        bert_base: Checkpoint name picked in the UI; either
            ``"bert-base-uncased"`` or ``"bert-base-multilingual-cased"``.

    Returns:
        A ``MultiTaskModel`` wired to a matching ``Preprocessor``, or ``None``
        when ``bert_base`` is not one of the two recognised names.
    """
    if bert_base == "bert-base-multilingual-cased":
        print("Loading multilingual model")
        multilingual_preprocessor = Preprocessor(is_multilingual=True)
        return MultiTaskModel(is_multilingual=True, preprocessor=multilingual_preprocessor)

    if bert_base != "bert-base-uncased":
        # Unknown checkpoint name: nothing to load.
        return None

    print("Loading base model")
    base_preprocessor = Preprocessor()
    return MultiTaskModel(preprocessor=base_preprocessor)
|
41 |
+
|
42 |
+
def clear():
    """Reset the cache held by ``loading_model``.

    Passed to the model selectbox as its ``on_change`` callback, so the
    cached model is dropped whenever the selection changes.
    """
    loading_model.clear()
|
44 |
+
|
45 |
+
bert_base = st.selectbox("Select a model", ("bert-base-uncased", "bert-base-multilingual-cased"), placeholder="Select a model", on_change=clear)
|
46 |
|
47 |
+
|
48 |
+
model = loading_model(bert_base)
|
49 |
|
50 |
# Get user input
user_input = st.text_input("Enter a Valorant chat message:")
st.write("You entered:", user_input)

# Only run inference when there is something to classify and a model to
# classify it with: the text box is empty on first render, and
# loading_model() returns None for a checkpoint name it does not recognise.
if model is not None and user_input.strip():
    # Predict
    prediction = model.predict(user_input)
    emotions, toxicities = prediction
    decoded = model.decode(prediction)

    col1, col2 = st.columns(2)

    # Left column: predicted emotion, then one score bar per emotion label.
    with col1:
        st.write(f"The predicted emotion is: {decoded[0][0][0]}" )
        for label, score in zip(EMOTION_LABELS, emotions[0]):
            st.write(f"{label}: {(score*100):.2f}%")
            st.progress(float(score))

    # Right column: predicted toxicity, then one score bar per toxicity label.
    with col2:
        st.write(f"The predicted toxicity is: {decoded[1][0][0]} ")
        for label, score in zip(TOXICITY_LABELS, toxicities[0]):
            st.write(f"{label}: {(score*100):.2f}%")
            st.progress(float(score))
|
70 |
|
71 |
+
|
|
|
|
|
|
model_with_bert_multilingual.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50a1ae46552fba31db5adea05b4edc300eb9c22a3932fbbf44e7ebeeaa883774
|
3 |
+
size 2141328712
|
pipeline/model/__init__.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
import tensorflow as tf
|
2 |
-
|
3 |
from tensorflow.keras.models import load_model
|
4 |
from transformers import TFBertModel
|
5 |
|
@@ -21,6 +19,7 @@ class MultiTaskModel:
|
|
21 |
|
22 |
def predict(self, text):
|
23 |
preptext= self.preprocessor.preprocess_text(text)
|
|
|
24 |
return self.model.predict(preptext)
|
25 |
|
26 |
def decode(self, pred):
|
|
|
|
|
|
|
1 |
from tensorflow.keras.models import load_model
|
2 |
from transformers import TFBertModel
|
3 |
|
|
|
19 |
|
20 |
def predict(self, text):
    """Run the wrapped model on a single piece of raw text.

    The text is first converted by ``self.preprocessor.preprocess_text`` and
    the result is handed to ``self.model.predict``.

    Args:
        text: Raw chat message to classify.

    Returns:
        Whatever ``self.model.predict`` returns for the preprocessed input.
    """
    # The debug ``print(self.model)`` was removed: it wrote the model repr to
    # stdout on every call and carried no information for callers.
    prepared = self.preprocessor.preprocess_text(text)
    return self.model.predict(prepared)
|
24 |
|
25 |
def decode(self, pred):
|
pipeline/model/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/pipeline/model/__pycache__/__init__.cpython-311.pyc and b/pipeline/model/__pycache__/__init__.cpython-311.pyc differ
|
|
pipeline/model/model_with_bert_multilingual.py
CHANGED
@@ -7,4 +7,28 @@ from tensorflow.keras.regularizers import l1_l2 # type: ignore
|
|
7 |
|
8 |
|
9 |
def build_model_multilingual(max_length = 65, layer = 40,dropout = 0.69, l2_lstm = 0.01, learning_rate = 1e-4):
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def build_model_multilingual(max_length = 65, layer = 40,dropout = 0.69, l2_lstm = 0.01, learning_rate = 1e-4):
    """Build and compile the two-headed (emotion + toxicity) multilingual BERT model.

    Args:
        max_length: Length of the ``input_ids`` sequence the model accepts.
        layer: Number of units in each LSTM.
        dropout: Dropout rate passed to each LSTM.
        l2_lstm: L2 factor for the LSTM kernel regularizers; the L1 factor is
            derived from it (x0.15 for the emotion branch, x0.2 for toxicity).
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` with outputs ``emotion_output`` (6-way
        softmax) and ``toxicity_output`` (7-way softmax).
    """

    def head_metrics(auc_name):
        # Fresh metric objects per head; only the AUC metric name differs.
        return [
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name=auc_name, multi_label=True),
            tf.keras.metrics.F1Score(name='f1_score'),
        ]  # type: ignore

    encoder = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    # Shared trunk: token ids -> first element of the BERT output
    # (presumably the per-token hidden states, since it feeds the LSTMs).
    token_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    sequence_features = encoder(token_ids)[0]  # type: ignore

    # One bidirectional LSTM per task, each with its own L1/L2 mix.
    emotion_lstm = LSTM(layer, dropout=dropout, kernel_regularizer=l1_l2(l2_lstm*0.15, l2_lstm))
    emotion_features = Bidirectional(emotion_lstm)(sequence_features)
    toxicity_lstm = LSTM(layer, dropout=dropout, kernel_regularizer=l1_l2(l2_lstm*0.2, l2_lstm))
    toxicity_features = Bidirectional(toxicity_lstm)(sequence_features)

    # Softmax classification heads.
    emotion_probs = Dense(6, activation='softmax', name='emotion_output')(emotion_features)
    toxicity_probs = Dense(7, activation='softmax', name='toxicity_output')(toxicity_features)

    multitask = Model(inputs=token_ids, outputs=[emotion_probs, toxicity_probs])
    multitask.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss={'emotion_output': 'categorical_crossentropy', 'toxicity_output': 'categorical_crossentropy'},
        metrics={
            'emotion_output': head_metrics('em_auc'),
            'toxicity_output': head_metrics('to_auc'),
        },
    )

    return multitask
|
pipeline/preprocessing/__init__.py
CHANGED
@@ -1,30 +1,3 @@
|
|
1 |
-
from
|
2 |
-
|
3 |
-
|
4 |
-
def __init__(self):
|
5 |
-
import pickle
|
6 |
-
with open('pipeline/preprocessing/encoder_toxicity.pkl', 'rb') as f:
|
7 |
-
self.__encoder_toxicity = pickle.load(f)
|
8 |
-
with open('pipeline/preprocessing/encoder_emotion.pkl', 'rb') as f:
|
9 |
-
self.__encoder_emotion = pickle.load(f)
|
10 |
-
|
11 |
-
# Decoding one-hot encoded labels
|
12 |
-
def toxicity(self,pred):
|
13 |
-
return self.__encoder_toxicity.inverse_transform(pred)
|
14 |
-
|
15 |
-
def emotion(self,pred):
|
16 |
-
return self.__encoder_emotion.inverse_transform(pred)
|
17 |
-
|
18 |
-
class Preprocessor:
|
19 |
-
"""A class used to represent a Preprocessor, which preprocesses text data for the model"""
|
20 |
-
def __init__(self, is_multilingual = False):
|
21 |
-
if is_multilingual:
|
22 |
-
self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
23 |
-
else:
|
24 |
-
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
25 |
-
self.decoder = Decoder()
|
26 |
-
"""Added a decoder object to the Preprocessor class to decode the one-hot encoded labels"""
|
27 |
-
|
28 |
-
def preprocess_text(self,text):
|
29 |
-
return self.tokenizer.encode(text,add_special_tokens=True, max_length=65,
|
30 |
-
padding="max_length", truncation=True, return_attention_mask=False, return_tensors='tf')
|
|
|
1 |
+
from .constants import EMOTION_LABELS, TOXICITY_LABELS
|
2 |
+
from .decoder import Decoder
|
3 |
+
from .tokenizer import Preprocessor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc and b/pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc differ
|
|
pipeline/preprocessing/constants.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
EMOTION_LABELS = (
|
2 |
+
"Angry",
|
3 |
+
"Disgust",
|
4 |
+
"Happy",
|
5 |
+
"Neutral",
|
6 |
+
"Sad",
|
7 |
+
"Surprise",
|
8 |
+
)
|
9 |
+
|
10 |
+
TOXICITY_LABELS = (
|
11 |
+
"Blaming Others",
|
12 |
+
"Cyberbullying",
|
13 |
+
"Gameplay Experience Complaints",
|
14 |
+
"Gamesplaining",
|
15 |
+
"Multiple Discrimination",
|
16 |
+
"Not Toxic",
|
17 |
+
"Sarcasm",
|
18 |
+
)
|
pipeline/preprocessing/decoder.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Decoder:
    """Maps encoded predictions back to labels.

    Loads two pickled encoders (toxicity and emotion) at construction time and
    exposes their ``inverse_transform`` through ``toxicity`` and ``emotion``.
    """

    def __init__(self):
        """Load ``encoder_toxicity.pkl`` and ``encoder_emotion.pkl``.

        Raises:
            OSError: If either pickle file cannot be opened.
        """
        import pickle
        from pathlib import Path

        # Resolve the pickles next to this module rather than relative to the
        # current working directory, so construction does not depend on where
        # the process was started from.
        encoder_dir = Path(__file__).resolve().parent

        # NOTE(review): pickle.load can execute arbitrary code from the file.
        # Presumably these are project-bundled artifacts; never point this at
        # user-supplied files.
        with open(encoder_dir / 'encoder_toxicity.pkl', 'rb') as handle:
            self.__encoder_toxicity = pickle.load(handle)
        with open(encoder_dir / 'encoder_emotion.pkl', 'rb') as handle:
            self.__encoder_emotion = pickle.load(handle)

    # Decoding one-hot encoded labels
    def toxicity(self, pred):
        """Return the toxicity encoder's ``inverse_transform`` of ``pred``."""
        return self.__encoder_toxicity.inverse_transform(pred)

    def emotion(self, pred):
        """Return the emotion encoder's ``inverse_transform`` of ``pred``."""
        return self.__encoder_emotion.inverse_transform(pred)
|
pipeline/preprocessing/tokenizer.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import BertTokenizer
|
2 |
+
from .decoder import Decoder
|
3 |
+
class Preprocessor:
    """A class used to represent a Preprocessor, which preprocesses text data for the model"""

    def __init__(self, is_multilingual = False):
        """Load the BERT tokenizer and create the label decoder.

        Args:
            is_multilingual: When true, load the
                ``bert-base-multilingual-cased`` tokenizer; otherwise load
                ``bert-base-uncased``.
        """
        checkpoint = 'bert-base-multilingual-cased' if is_multilingual else 'bert-base-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        # Decoder object used to decode the one-hot encoded labels.
        self.decoder = Decoder()

    def preprocess_text(self, text, max_length=65):
        """Tokenize ``text`` into a fixed-length tensor of token ids.

        Args:
            text: Raw text to encode.
            max_length: Length to pad or truncate to. Defaults to 65, the
                value that was previously hard-coded here.

        Returns:
            The result of ``self.tokenizer.encode`` with special tokens added,
            padding/truncation to ``max_length``, no attention mask, and
            ``return_tensors='tf'``.
        """
        return self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=False,
            return_tensors='tf',
        )
|