cspocketindia committed
Commit • 01f65eb
0 Parent(s)
first commit
- .gitignore +7 -0
- emotion_mapping_finalized.json +16 -0
- gradio_app.py +23 -0
- requirements.txt +17 -0
- src/data_loader/go_emotions.py +19 -0
- src/models/__pycache__/bert.cpython-39.pyc +0 -0
- src/models/bert.py +256 -0
- src/utils/__pycache__/utilities.cpython-39.pyc +0 -0
- src/utils/utilities.py +12 -0
- src/views/index.html +10 -0
.gitignore
ADDED
@@ -0,0 +1,7 @@
+/app.py
+/Bert_GoEmotions_4Epochs.model
+/curl_gradio.bat
+/Dockerfile
+/flagged
+/run_gradio_client.py
+/streamlit_app.py
emotion_mapping_finalized.json
ADDED
@@ -0,0 +1,16 @@
+{
+    "joy": ["joy", "amusement", "excitement"],
+    "desire": ["desire"],
+    "pride": ["pride", "admiration", "relief"],
+    "agreement": ["approval", "realization"],
+    "surprise": ["surprise", "curiosity"],
+    "love": ["love", "caring"],
+    "confusion": ["confusion"],
+    "anger": ["anger", "disapproval"],
+    "disgust": ["disgust", "annoyance"],
+    "sadness": ["sadness", "grief", "remorse", "embarrassment"],
+    "fear": ["fear", "nervousness"],
+    "optimism": ["optimism", "gratitude"],
+    "disappointment": ["disappointment"],
+    "neutral": ["neutral"]
+}
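The file above collapses the fine-grained GoEmotions labels into 14 coarse classes. A minimal sketch (not part of the commit, assuming it is run from the repository root) of inverting this mapping:

import json

# Invert the coarse-to-fine mapping: each fine-grained GoEmotions label
# points to its coarse class (e.g. "grief" -> "sadness").
with open("emotion_mapping_finalized.json") as f:
    coarse_to_fine = json.load(f)

fine_to_coarse = {fine: coarse
                  for coarse, fines in coarse_to_fine.items()
                  for fine in fines}

print(len(coarse_to_fine))       # 14 coarse classes
print(fine_to_coarse["grief"])   # "sadness"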
gradio_app.py
ADDED
@@ -0,0 +1,23 @@
+import gradio
+
+from src.models.bert import BERTClassifier
+from src.utils.utilities import Utility
+
+model = BERTClassifier(model_name='Bert_GoEmotions_4Epochs.model')
+
+classes = Utility().read_emotion_list()
+
+def predict(sentence):
+    print(sentence)
+    predictions = model.evaluate([sentence])
+    print(f"Predictions: {predictions}")
+    return classes[predictions[0]]
+
+gradio.Interface(
+    fn=predict,
+    inputs="text",
+    outputs="text",
+    allow_flagging='auto',
+    flagging_dir='logs',
+    flagging_callback=gradio.SimpleCSVLogger(),
+).launch()
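The curl_gradio.bat and run_gradio_client.py clients referenced in .gitignore are not committed. A hypothetical client sketch, assuming the app above is running locally and the installed Gradio version still serves the legacy /api/predict JSON route:

import requests

# Hypothetical request against the locally running Interface; the payload
# shape {"data": [...]} matches the legacy Gradio REST API (an assumption
# about the installed Gradio version, not something shown in this commit).
resp = requests.post(
    "http://127.0.0.1:7860/api/predict",
    json={"data": ["I am so happy today!"]},
)
print(resp.json()["data"][0])    # one of the 14 coarse emotion classes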
requirements.txt
ADDED
@@ -0,0 +1,17 @@
+# pyarrow
+# plotly
+# nbformat
+# gensim
+# keras
+pandas
+seaborn
+nltk
+wordcloud
+tensorflow
+tensorflow_hub
+transformers
+flask
+torch
+torchvision
+scikit-learn
+numpy
src/data_loader/go_emotions.py
ADDED
@@ -0,0 +1,19 @@
+import os
+import torch
+import pickle
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+from ..utils.utilities import Utility
+
+
+class GoEmotionsDataset(torch.utils.data.Dataset):
+    util = Utility()
+    def __init__(self, embeddings, labels):
+        self.labels = labels
+        self.instances = embeddings
+
+    def __len__(self):
+        return self.instances.shape[0]
+
+    def __getitem__(self, idx):
+        return self.instances[idx], self.labels[idx]
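A minimal usage sketch (not part of the commit) of wrapping GoEmotionsDataset in a DataLoader, assuming embeddings is a [num_examples, dim] tensor, labels a [num_examples] tensor, and the repository root is on sys.path:

import torch
from torch.utils.data import DataLoader
from src.data_loader.go_emotions import GoEmotionsDataset

embeddings = torch.randn(8, 768)       # placeholder sentence embeddings
labels = torch.randint(0, 14, (8,))    # placeholder coarse-class ids

loader = DataLoader(GoEmotionsDataset(embeddings, labels), batch_size=4, shuffle=True)
for batch_embeddings, batch_labels in loader:
    print(batch_embeddings.shape, batch_labels.shape)   # torch.Size([4, 768]) torch.Size([4])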
src/models/__pycache__/bert.cpython-39.pyc
ADDED
Binary file (5.74 kB)
src/models/bert.py
ADDED
@@ -0,0 +1,256 @@
+import time
+import datetime
+import torch
+import numpy as np
+import tqdm
+import random
+from torch import nn
+from transformers import RobertaTokenizer, RobertaModel, AdamW, RobertaConfig
+from sklearn.model_selection import train_test_split
+
+from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
+from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
+
+
+class BERTClassifier():
+
+
+    def __init__(self, model_name="bert-base-uncased", tokenizer_name="bert-base-uncased") -> None:
+        print(f'Loading BERT tokenizer:{model_name}...')
+
+        self.model_name = model_name
+
+        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
+
+        if model_name.endswith('.model'):
+            self.model = torch.load(model_name)
+            torch.save(self.model.cpu(), model_name)
+        else:
+            self.model = BertForSequenceClassification.from_pretrained(
+                self.model_name,
+                num_labels=14,
+                output_attentions=False,
+                output_hidden_states=False
+            )
+
+        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+        self.model.to(self.device)
+
+    def tokenizeText(self, sentence: str):
+        # return self.tokenizer.encode(sentence, add_special_tokens=True)
+        encoded_dict = self.tokenizer.encode_plus(
+            sentence,
+            add_special_tokens=True,
+            max_length=64,
+            pad_to_max_length=True,
+            return_attention_mask=True,
+            return_tensors='pt')
+        return encoded_dict['input_ids'], encoded_dict['attention_mask']
+
+    def tokenizeSentences(self, sentences: list, labels: list):
+        input_ids = []
+        attention_masks = []
+        for sent in sentences:
+            input_id, attention_mask = self.tokenizeText(sent)
+            input_ids.append(input_id)
+            attention_masks.append(attention_mask)
+
+        input_ids = torch.cat(input_ids, dim=0)
+        attention_masks = torch.cat(attention_masks, dim=0)
+
+        dataset = TensorDataset(input_ids, attention_masks, labels)
+
+        train_size = int(0.9 * len(dataset))
+        val_size = len(dataset) - train_size
+        return random_split(dataset, [train_size, val_size])
+
+    def flat_accuracy(self, preds, labels):
+        pred_flat = np.argmax(preds, axis=1).flatten()
+        labels_flat = labels.flatten()
+        return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+    def format_time(self, elapsed):
+        # Round to the nearest second.
+        elapsed_rounded = int(round((elapsed)))
+
+        # Format as hh:mm:ss
+        return str(datetime.timedelta(seconds=elapsed_rounded))
+
+    def trainModel(self, sentences: list, labels: list, epochs=4, batch_size=32):
+        optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
+
+        train_dataset, val_dataset = self.tokenizeSentences(sentences, labels)
+
+        train_dataloader = DataLoader(
+            train_dataset,
+            sampler=RandomSampler(train_dataset),
+            batch_size=batch_size
+        )
+
+        validation_dataloader = DataLoader(
+            val_dataset,
+            sampler=SequentialSampler(val_dataset),
+            batch_size=batch_size
+        )
+
+        total_steps = len(train_dataloader) * epochs
+
+        # Create the learning rate scheduler.
+        scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                    num_warmup_steps=0,  # Default value in run_glue.py
+                                                    num_training_steps=total_steps)
+
+        self.train(train_dataloader, optimizer, scheduler, epochs)
+        torch.save(self.model, f"Bert_GoEmotions_BS{batch_size}_E{epochs}.model")
+
+
+    def train(self, train_dataloader, optimizer, scheduler, epochs):
+        # This training code is based on the `run_glue.py` script here:
+        # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+        # Measure the total training time for the whole run.
+        total_t0 = time.time()
+
+        # For each epoch...
+        for epoch_i in range(epochs):
+            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+            print('Training...')
+
+            # Measure how long the training epoch takes.
+            t0 = time.time()
+
+            # Reset the total loss for this epoch.
+            total_train_loss = 0
+
+            # Put the model into training mode. Don't be misled--the call to
+            # `train` just changes the *mode*, it doesn't *perform* the training.
+            # `dropout` and `batchnorm` layers behave differently during training
+            # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
+            self.model.train()
+
+            # For each batch of training data...
+            for step, batch in enumerate(train_dataloader):
+
+                # Progress update every 40 batches.
+                if step % 40 == 0 and step != 0:
+                    # Calculate elapsed time in minutes.
+                    elapsed = self.format_time(time.time() - t0)
+
+                    # Report progress.
+                    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+                # Unpack this training batch from our dataloader.
+                #
+                # As we unpack the batch, we'll also copy each tensor to the GPU using the
+                # `to` method.
+                #
+                # `batch` contains three pytorch tensors:
+                #   [0]: input ids
+                #   [1]: attention masks
+                #   [2]: labels
+                b_input_ids = batch[0].to(self.device)
+                b_input_mask = batch[1].to(self.device)
+                b_labels = batch[2].to(self.device)
+
+                # Always clear any previously calculated gradients before performing a
+                # backward pass. PyTorch doesn't do this automatically because
+                # accumulating the gradients is "convenient while training RNNs".
+                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+                self.model.zero_grad()
+
+                # Perform a forward pass (evaluate the model on this training batch).
+                # The documentation for this `model` function is here:
+                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                # It returns different numbers of parameters depending on what arguments
+                # are given and what flags are set. For our usage here, it returns
+                # the loss (because we provided labels) and the "logits"--the model
+                # outputs prior to activation.
+
+                output = self.model(b_input_ids,
+                                    token_type_ids=None,
+                                    attention_mask=b_input_mask,
+                                    labels=b_labels)
+
+
+                loss = output.loss
+                logits = output.logits
+
+                # Accumulate the training loss over all of the batches so that we can
+                # calculate the average loss at the end. `loss` is a Tensor containing a
+                # single value; the `.item()` function just returns the Python value
+                # from the tensor.
+                total_train_loss += loss.item()
+
+                # Perform a backward pass to calculate the gradients.
+                loss.backward()
+
+                # Clip the norm of the gradients to 1.0.
+                # This is to help prevent the "exploding gradients" problem.
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                # Update parameters and take a step using the computed gradient.
+                # The optimizer dictates the "update rule"--how the parameters are
+                # modified based on their gradients, the learning rate, etc.
+                optimizer.step()
+
+                # Update the learning rate.
+                scheduler.step()
+
+            # Calculate the average loss over all of the batches.
+            avg_train_loss = total_train_loss / len(train_dataloader)
+
+            # Measure how long this epoch took.
+            training_time = self.format_time(time.time() - t0)
+
+            print("")
+            print("  Average training loss: {0:.2f}".format(avg_train_loss))
+            print("  Training epoch took: {:}".format(training_time))
+
+        print("")
+        print("Training complete!")
+
+        print("Total training took {:} (h:mm:ss)".format(self.format_time(time.time()-total_t0)))
+
+    def evaluate(self, sentences: list):
+        input_ids = []
+        attention_masks = []
+
+        for sent in sentences:
+            input_id, attention_mask = self.tokenizeText(sent)
+            input_ids.append(input_id)
+            attention_masks.append(attention_mask)
+
+        input_ids = torch.cat(input_ids, dim=0)
+        attention_masks = torch.cat(attention_masks, dim=0)
+        labels = torch.zeros(len(sentences))
+
+        batch_size = 32
+
+        prediction_data = TensorDataset(input_ids, attention_masks, labels)
+        prediction_sampler = SequentialSampler(prediction_data)
+        prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+        self.model.eval()
+
+        predictions = []
+
+        for batch in prediction_dataloader:
+            batch = tuple(t.to(self.device) for t in batch)
+
+            b_input_ids, b_input_mask, _ = batch
+
+            with torch.no_grad():
+                outputs = self.model(b_input_ids, token_type_ids=None,
+                                     attention_mask=b_input_mask)
+
+            logits = outputs[0]
+
+            logits = logits.detach().cpu().numpy()
+            predictions.append(logits)
+
+        # Merge the per-batch logits so inputs longer than one batch are handled,
+        # then take the argmax per sentence.
+        predictions = np.concatenate(predictions, axis=0)
+        return [predictions[i].argmax() for i, _ in enumerate(sentences)]
+
+
+
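A hypothetical fine-tuning sketch (not part of the commit), assuming the GoEmotions split has already been loaded into a list of sentences and a tensor of integer labels for the 14 coarse classes:

import torch
from src.models.bert import BERTClassifier

sentences = ["I am so happy today!", "This is terrifying."]   # placeholder data
labels = torch.tensor([0, 10])                                # placeholder coarse-class ids

clf = BERTClassifier()   # starts from bert-base-uncased with num_labels=14
clf.trainModel(sentences, labels, epochs=4, batch_size=32)
# trainModel saves the fine-tuned weights as Bert_GoEmotions_BS32_E4.model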
src/utils/__pycache__/utilities.cpython-39.pyc
ADDED
Binary file (689 Bytes)
src/utils/utilities.py
ADDED
@@ -0,0 +1,12 @@
+import json
+
+class Utility:
+
+    def read_emotion_list(self):
+        with open('./emotion_mapping_finalized.json') as emo_mapping_file:
+            finalized_emotions = json.load(emo_mapping_file)
+        emotions_mapping = {}
+        for key, values in finalized_emotions.items():
+            for emotion in values:
+                emotions_mapping[emotion] = key
+        return list(finalized_emotions.keys())
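A minimal sketch of how gradio_app.py uses this helper, assuming it is run from the repository root so the JSON path resolves:

from src.utils.utilities import Utility

classes = Utility().read_emotion_list()
print(len(classes))   # 14
print(classes[0])     # "joy" -- the list index matches the model's argmax output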
src/views/index.html
ADDED
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<body>
+    <h1>Sentiment Prediction</h1>
+    <form action="/predict" method="POST">
+        <p><input type="text" name="text" /></p>
+        <p><input type="submit" value="PREDICT" /></p>
+    </form>
+</body>
+</html>
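The Flask app that would serve this form (app.py) is gitignored and not part of the commit; the following is only a hypothetical sketch of how the /predict form could be wired to the classifier:

from flask import Flask, render_template, request

from src.models.bert import BERTClassifier
from src.utils.utilities import Utility

app = Flask(__name__, template_folder="src/views")
model = BERTClassifier(model_name="Bert_GoEmotions_4Epochs.model")
classes = Utility().read_emotion_list()

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/predict", methods=["POST"])
def predict():
    sentence = request.form["text"]             # matches name="text" in the form
    prediction = model.evaluate([sentence])[0]  # index into the 14 coarse classes
    return classes[prediction]

if __name__ == "__main__":
    app.run()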