cooldragon12 committed on
Commit f67c2de • 1 Parent(s): 3d200ef

Added Multilingual BERT Base
.gitignore CHANGED
@@ -1,3 +1,2 @@
  **/__pycache__/
- **venv/
- model_with_bert_multilingual.h5
+ **venv/
README.md CHANGED
@@ -1,5 +1,6 @@
  ---
  title: Multitask Classifying Emotion Toxicity Valorant Chat
+ summary: This space contains a Streamlit app that classifies the toxicity and emotion of Valorant chat messages, using the model from ChattyTicket.
  emoji: 📊
  colorFrom: blue
  colorTo: purple
@@ -10,4 +11,6 @@ pinned: false
  license: mit
  ---
  
+ 
+ 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,66 +1,71 @@
  import streamlit as st
- 
  from pipeline.model import MultiTaskModel
- from pipeline.preprocessing import Preprocessor
+ from pipeline.preprocessing import Preprocessor, EMOTION_LABELS, TOXICITY_LABELS
  
  # Load the model
- EMOTION_CHOICES = (
-     "Angry",
-     "Disgust",
-     "Happy",
-     "Neutral",
-     "Sad",
-     "Surprise",
- )
- TOXICITY_CHOICES = (
-     "Blaming Others",
-     "Cyberbullying",
-     "Gameplay Experience Complaints",
-     "Gamesplaining",
-     "Multiple Discrimination",
-     "Not Toxic",
-     "Sarcasm",
- )
- 
- st.title("Emotion and Toxicity Classification of Valorant chat messages")
- 
- st.write(
-     'This is a simple web app that predicts the emotion and toxicity of Valorant chat messages. Enter a message in the text box below and click the "Predict" button to get the prediction.'
+ st.title("ChattyTicket Model: Emotion and Toxicity Classification of Valorant chat messages")
+ 
+ st.write("""Discover our innovative model designed to tackle toxicity in Valorant, a popular multiplayer game. Leveraging advanced multi-task learning, our model combines Bi-LSTM and BERT architectures for superior performance. This approach enables accurate classification of toxic conversations and emotional content within the game.
+ 
+ Key Features:
+ - Multi-Task Learning: Simultaneous classification of toxicity and emotion.
+ - Bi-LSTM & BERT Integration: Enhanced accuracy with a BERT pre-trained backbone.
+ - High Accuracy: 91.81% in toxicity detection and 86.74% in emotion prediction.
+ Insights & Impact:
+ - Emotional Landscape: Identifies prevalent emotions like anger and instances of cyberbullying.
+ - Healthier Communities: Provides insights for fostering positive gaming environments.
+ ChattyTicket API:
+ - Evaluate chat text for emotion and toxicity with our user-friendly API. Positive user feedback highlights its effectiveness and interactivity, with ongoing improvements based on user input to better handle nuances like sarcasm.
+ 
+ Experience the future of online gaming moderation with our model and contribute to a healthier, more enjoyable gaming community.
+ """
  )
  
  st.table(
      {
-         "Emotion": EMOTION_CHOICES,
-         "Toxicity": TOXICITY_CHOICES,
+         "Emotion": EMOTION_LABELS,
+         "Toxicity": TOXICITY_LABELS,
      }
  )
  
  @st.cache_resource
- def loading_model():
-     return MultiTaskModel(preprocessor=Preprocessor())
+ def loading_model(bert_base):
+     if bert_base == "bert-base-uncased":
+         print("Loading base model")
+         return MultiTaskModel(preprocessor=Preprocessor())
+     elif bert_base == "bert-base-multilingual-cased":
+         print("Loading multilingual model")
+         return MultiTaskModel(is_multilingual=True, preprocessor=Preprocessor(is_multilingual=True))
+     else:
+         return None
+ 
+ def clear():
+     loading_model.clear()
+ 
+ bert_base = st.selectbox("Select a model", ("bert-base-uncased", "bert-base-multilingual-cased"), placeholder="Select a model", on_change=clear)
  
- model = loading_model()
+ 
+ model = loading_model(bert_base)
  
  # Get user input
  user_input = st.text_input("Enter a Valorant chat message:")
  st.write("You entered:", user_input)
- 
  # Predict
  prediction = model.predict(user_input)
  emotions, toxicitys = prediction
+ decoded = model.decode(prediction)
  
  col1, col2 = st.columns(2)
  
  with col1:
+     st.write(f"The predicted emotion is: {decoded[0][0][0]}")
      for i, emotion in enumerate(emotions[0]):
-         st.write(f"{EMOTION_CHOICES[i]}: {(emotion*100):.2f}%")
+         st.write(f"{EMOTION_LABELS[i]}: {(emotion*100):.2f}%")
          st.progress(float(emotion))
  with col2:
+     st.write(f"The predicted toxicity is: {decoded[1][0][0]}")
      for i, toxicity in enumerate(toxicitys[0]):
-         st.write(f"{TOXICITY_CHOICES[i]}: {(toxicity*100):.2f}%")
+         st.write(f"{TOXICITY_LABELS[i]}: {(toxicity*100):.2f}%")
          st.progress(float(toxicity))
  
- decoded = model.decode(prediction)
- # Display the prediction
- st.write("The predicted emotion is:", decoded[0][0])
- st.write("The predicted toxicity is:", decoded[1][0])
+ 
 
 
model_with_bert_multilingual.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50a1ae46552fba31db5adea05b4edc300eb9c22a3932fbbf44e7ebeeaa883774
+ size 2141328712
pipeline/model/__init__.py CHANGED
@@ -1,5 +1,3 @@
- import tensorflow as tf
- 
  from tensorflow.keras.models import load_model
  from transformers import TFBertModel
  
@@ -21,6 +19,7 @@ class MultiTaskModel:
  
      def predict(self, text):
          preptext = self.preprocessor.preprocess_text(text)
+         print(self.model)
          return self.model.predict(preptext)
  
      def decode(self, pred):
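The loading code itself sits above this hunk and is not shown, but Keras models that embed a Hugging Face TFBertModel layer have to be deserialized with that class passed through custom_objects, which is presumably why TFBertModel is imported here. A minimal sketch under that assumption, using the checkpoint added in this commit:

```python
from tensorflow.keras.models import load_model
from transformers import TFBertModel

# Hypothetical load call: the saved .h5 contains a custom TFBertModel layer,
# so Keras needs the class mapping to rebuild it.
keras_model = load_model(
    "model_with_bert_multilingual.h5",
    custom_objects={"TFBertModel": TFBertModel},
)
keras_model.summary()
```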
pipeline/model/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/pipeline/model/__pycache__/__init__.cpython-311.pyc and b/pipeline/model/__pycache__/__init__.cpython-311.pyc differ
 
pipeline/model/model_with_bert_multilingual.py CHANGED
@@ -7,4 +7,28 @@ from tensorflow.keras.regularizers import l1_l2 # type: ignore
  
  
  def build_model_multilingual(max_length = 65, layer = 40, dropout = 0.69, l2_lstm = 0.01, learning_rate = 1e-4):
- 
+     bert = TFBertModel.from_pretrained('bert-base-multilingual-cased')
+     # Model definition inside the loop
+     input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
+     bert_output = bert(input_ids)[0]  # type: ignore
+ 
+     bi_lstm_emotion = Bidirectional(LSTM(layer, dropout=dropout, kernel_regularizer=l1_l2(l2_lstm*0.15, l2_lstm)))(bert_output)
+     bi_lstm_toxicity = Bidirectional(LSTM(layer, dropout=dropout, kernel_regularizer=l1_l2(l2_lstm*0.2, l2_lstm)))(bert_output)  # outputs
+ 
+     output_emotion = Dense(6, activation='softmax', name='emotion_output')(bi_lstm_emotion)
+     output_toxicity = Dense(7, activation='softmax', name='toxicity_output')(bi_lstm_toxicity)
+ 
+ 
+     model = Model(inputs=input_ids, outputs=[output_emotion, output_toxicity])
+     # # Compile
+     # model = create_multitask_model_with_bert(y_toxicity, y_emotion, TFBertModel, max_length, lstm_dropout=0.2, layers=lstm_layers)
+     model.compile(
+         optimizer=Adam(learning_rate=learning_rate),
+         loss={'emotion_output': 'categorical_crossentropy', 'toxicity_output': 'categorical_crossentropy'},
+         metrics={
+             'emotion_output': ['accuracy', tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.AUC(name='em_auc', multi_label=True), tf.keras.metrics.F1Score(name='f1_score')],  # type: ignore
+             'toxicity_output': ['accuracy', tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.AUC(name='to_auc', multi_label=True), tf.keras.metrics.F1Score(name='f1_score')],  # type: ignore
+         }
+     )
+ 
+     return model
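As a quick sanity check of the architecture above, the builder can be called directly; the two softmax heads then yield per-example probability vectors whose lengths match EMOTION_LABELS and TOXICITY_LABELS. This sketch assumes the module's unshown imports (tf, Input, Bidirectional, LSTM, Dense, Model, Adam, TFBertModel) are in place and that the module is importable at this path:

```python
import tensorflow as tf
from pipeline.model.model_with_bert_multilingual import build_model_multilingual  # assumed import path

model = build_model_multilingual(max_length=65)

# A dummy batch of token ids (all [PAD]) stands in for real tokenizer output.
dummy_ids = tf.zeros((1, 65), dtype=tf.int32)
emotion_probs, toxicity_probs = model(dummy_ids)
print(emotion_probs.shape, toxicity_probs.shape)  # (1, 6) and (1, 7)
```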
pipeline/preprocessing/__init__.py CHANGED
@@ -1,30 +1,3 @@
- from transformers import BertTokenizer
- 
- class Decoder:
-     def __init__(self):
-         import pickle
-         with open('pipeline/preprocessing/encoder_toxicity.pkl', 'rb') as f:
-             self.__encoder_toxicity = pickle.load(f)
-         with open('pipeline/preprocessing/encoder_emotion.pkl', 'rb') as f:
-             self.__encoder_emotion = pickle.load(f)
- 
-     # Decoding one-hot encoded labels
-     def toxicity(self, pred):
-         return self.__encoder_toxicity.inverse_transform(pred)
- 
-     def emotion(self, pred):
-         return self.__encoder_emotion.inverse_transform(pred)
- 
- class Preprocessor:
-     """A class used to represent a Preprocessor, which preprocesses text data for the model"""
-     def __init__(self, is_multilingual = False):
-         if is_multilingual:
-             self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
-         else:
-             self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-         self.decoder = Decoder()
-         """Added a decoder object to the Preprocessor class to decode the one-hot encoded labels"""
- 
-     def preprocess_text(self, text):
-         return self.tokenizer.encode(text, add_special_tokens=True, max_length=65,
-             padding="max_length", truncation=True, return_attention_mask=False, return_tensors='tf')
+ from .constants import EMOTION_LABELS, TOXICITY_LABELS
+ from .decoder import Decoder
+ from .tokenizer import Preprocessor
pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc and b/pipeline/preprocessing/__pycache__/__init__.cpython-311.pyc differ
 
pipeline/preprocessing/constants.py ADDED
@@ -0,0 +1,18 @@
+ EMOTION_LABELS = (
+     "Angry",
+     "Disgust",
+     "Happy",
+     "Neutral",
+     "Sad",
+     "Surprise",
+ )
+ 
+ TOXICITY_LABELS = (
+     "Blaming Others",
+     "Cyberbullying",
+     "Gameplay Experience Complaints",
+     "Gamesplaining",
+     "Multiple Discrimination",
+     "Not Toxic",
+     "Sarcasm",
+ )
pipeline/preprocessing/decoder.py ADDED
@@ -0,0 +1,14 @@
+ class Decoder:
+     def __init__(self):
+         import pickle
+         with open('pipeline/preprocessing/encoder_toxicity.pkl', 'rb') as v:
+             self.__encoder_toxicity = pickle.load(v)
+         with open('pipeline/preprocessing/encoder_emotion.pkl', 'rb') as v:
+             self.__encoder_emotion = pickle.load(v)
+ 
+     # Decoding one-hot encoded labels
+     def toxicity(self, pred):
+         return self.__encoder_toxicity.inverse_transform(pred)
+ 
+     def emotion(self, pred):
+         return self.__encoder_emotion.inverse_transform(pred)
pipeline/preprocessing/tokenizer.py ADDED
@@ -0,0 +1,15 @@
+ from transformers import BertTokenizer
+ from .decoder import Decoder
+ class Preprocessor:
+     """A class used to represent a Preprocessor, which preprocesses text data for the model"""
+     def __init__(self, is_multilingual = False):
+         if is_multilingual:
+             self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
+         else:
+             self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+         self.decoder = Decoder()
+         """Added a decoder object to the Preprocessor class to decode the one-hot encoded labels"""
+ 
+     def preprocess_text(self, text):
+         return self.tokenizer.encode(text, add_special_tokens=True, max_length=65,
+             padding="max_length", truncation=True, return_attention_mask=False, return_tensors='tf')
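A short usage sketch of the refactored preprocessing package, tying the tokenizer output to the model input shape above (the example text is arbitrary):

```python
from pipeline.preprocessing import Preprocessor, EMOTION_LABELS, TOXICITY_LABELS

pre = Preprocessor(is_multilingual=True)

# BertTokenizer.encode with padding="max_length" and return_tensors='tf'
# returns an int32 tensor of shape (1, 65), matching the model's input_ids.
ids = pre.preprocess_text("gg wp")
print(ids.shape)  # (1, 65)

# The label tuples line up with the two softmax heads (6 emotions, 7 toxicity classes).
print(len(EMOTION_LABELS), len(TOXICITY_LABELS))  # 6 7
```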