cspocketindia committed on
Commit 01f65eb
0 Parent(s)

first commit

.gitignore ADDED
@@ -0,0 +1,7 @@
+ /app.py
+ /Bert_GoEmotions_4Epochs.model
+ /curl_gradio.bat
+ /Dockerfile
+ /flagged
+ /run_gradio_client.py
+ /streamlit_app.py
emotion_mapping_finalized.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "joy": ["joy", "amusement", "excitement"],
+     "desire": ["desire"],
+     "pride": ["pride", "admiration", "relief"],
+     "agreement": ["approval", "realization"],
+     "surprise": ["surprise", "curiosity"],
+     "love": ["love", "caring"],
+     "confusion": ["confusion"],
+     "anger": ["anger", "disapproval"],
+     "disgust": ["disgust", "annoyance"],
+     "sadness": ["sadness", "grief", "remorse", "embarrassment"],
+     "fear": ["fear", "nervousness"],
+     "optimism": ["optimism", "gratitude"],
+     "disappointment": ["disappointment"],
+     "neutral": ["neutral"]
+ }
gradio_app.py ADDED
@@ -0,0 +1,23 @@
+ import gradio
+
+ from src.models.bert import BERTClassifier
+ from src.utils.utilities import Utility
+
+ model = BERTClassifier(model_name='Bert_GoEmotions_4Epochs.model')
+
+ classes = Utility().read_emotion_list()
+
+ def predict(sentence):
+     print(sentence)
+     predictions = model.evaluate([sentence])
+     print(f"Predictions: {predictions}")
+     return classes[predictions[0]]
+
+ gradio.Interface(
+     fn=predict,
+     inputs="text",
+     outputs="text",
+     allow_flagging='auto',
+     flagging_dir='logs',
+     flagging_callback=gradio.SimpleCSVLogger(),
+ ).launch()
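
The gitignored run_gradio_client.py and curl_gradio.bat suggest the interface is also exercised over HTTP. A minimal client sketch, assuming the app is running locally on Gradio's default port and the classic /api/predict endpoint (neither is confirmed by this commit):

    import requests

    # Hypothetical client; the URL and endpoint are assumptions based on Gradio defaults.
    response = requests.post(
        "http://127.0.0.1:7860/api/predict/",
        json={"data": ["I can't believe how well this worked!"]},
    )
    print(response.json()["data"][0])  # one of the 14 coarse emotion labels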
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # pyarrow
+ # plotly
+ # nbformat
+ # gensim
+ # keras
+ pandas
+ seaborn
+ nltk
+ wordcloud
+ tensorflow
+ tensorflow_hub
+ transformers
+ flask
+ torch
+ torchvision
+ scikit-learn
+ numpy
src/data_loader/go_emotions.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+
+ from ..utils.utilities import Utility
+
+
+ class GoEmotionsDataset(torch.utils.data.Dataset):
+     util = Utility()
+
+     def __init__(self, embeddings, labels):
+         self.labels = labels
+         self.instances = embeddings
+
+     def __len__(self):
+         return self.instances.shape[0]
+
+     def __getitem__(self, idx):
+         return self.instances[idx], self.labels[idx]
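
A usage sketch for the dataset wrapper; the embedding shape and label range below are placeholders, since the preprocessing that produces them is not part of this commit:

    import torch
    from torch.utils.data import DataLoader
    from src.data_loader.go_emotions import GoEmotionsDataset

    # Placeholder tensors standing in for precomputed sentence embeddings and labels.
    embeddings = torch.randn(100, 768)
    labels = torch.randint(0, 14, (100,))

    dataset = GoEmotionsDataset(embeddings, labels)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    for batch_embeddings, batch_labels in loader:
        print(batch_embeddings.shape, batch_labels.shape)  # torch.Size([32, 768]) torch.Size([32])
        break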
src/models/__pycache__/bert.cpython-39.pyc ADDED
Binary file (5.74 kB).
 
src/models/bert.py ADDED
@@ -0,0 +1,227 @@
+ import time
+ import datetime
+ import torch
+ import numpy as np
+ from transformers import AdamW, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
+ from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
+
+
+ class BERTClassifier:
+
+     def __init__(self, model_name="bert-base-uncased", tokenizer_name="bert-base-uncased") -> None:
+         print(f'Loading BERT tokenizer: {model_name}...')
+
+         self.model_name = model_name
+
+         self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
+
+         if model_name.endswith('.model'):
+             # Load a fine-tuned checkpoint, re-saving it on the CPU so it can
+             # be loaded on machines without a GPU.
+             self.model = torch.load(model_name)
+             torch.save(self.model.cpu(), model_name)
+         else:
+             self.model = BertForSequenceClassification.from_pretrained(
+                 self.model_name,
+                 num_labels=14,
+                 output_attentions=False,
+                 output_hidden_states=False
+             )
+
+         self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+         self.model.to(self.device)
+
+     def tokenizeText(self, sentence: str):
+         encoded_dict = self.tokenizer.encode_plus(
+             sentence,
+             add_special_tokens=True,
+             max_length=64,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt')
+         return encoded_dict['input_ids'], encoded_dict['attention_mask']
+
+     def tokenizeSentences(self, sentences: list, labels: torch.Tensor):
+         input_ids = []
+         attention_masks = []
+         for sent in sentences:
+             input_id, attention_mask = self.tokenizeText(sent)
+             input_ids.append(input_id)
+             attention_masks.append(attention_mask)
+
+         input_ids = torch.cat(input_ids, dim=0)
+         attention_masks = torch.cat(attention_masks, dim=0)
+
+         dataset = TensorDataset(input_ids, attention_masks, labels)
+
+         # 90/10 train/validation split.
+         train_size = int(0.9 * len(dataset))
+         val_size = len(dataset) - train_size
+         return random_split(dataset, [train_size, val_size])
+
+     def flat_accuracy(self, preds, labels):
+         pred_flat = np.argmax(preds, axis=1).flatten()
+         labels_flat = labels.flatten()
+         return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+     def format_time(self, elapsed):
+         # Round to the nearest second and format as hh:mm:ss.
+         elapsed_rounded = int(round(elapsed))
+         return str(datetime.timedelta(seconds=elapsed_rounded))
+
+     def trainModel(self, sentences: list, labels: torch.Tensor, epochs=4, batch_size=32):
+         optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
+
+         train_dataset, val_dataset = self.tokenizeSentences(sentences, labels)
+
+         train_dataloader = DataLoader(
+             train_dataset,
+             sampler=RandomSampler(train_dataset),
+             batch_size=batch_size
+         )
+
+         validation_dataloader = DataLoader(
+             val_dataset,
+             sampler=SequentialSampler(val_dataset),
+             batch_size=batch_size
+         )
+
+         total_steps = len(train_dataloader) * epochs
+
+         # Create the learning rate scheduler.
+         scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                     num_warmup_steps=0,  # Default value in run_glue.py
+                                                     num_training_steps=total_steps)
+
+         self.train(train_dataloader, optimizer, scheduler, epochs)
+         torch.save(self.model, f"Bert_GoEmotions_BS{batch_size}_E{epochs}.model")
+
+     def train(self, train_dataloader, optimizer, scheduler, epochs):
+         # This training code is based on the `run_glue.py` script here:
+         # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+         # Measure the total training time for the whole run.
+         total_t0 = time.time()
+
+         for epoch_i in range(epochs):
+             print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+             print('Training...')
+
+             # Measure how long the training epoch takes.
+             t0 = time.time()
+
+             # Reset the total loss for this epoch.
+             total_train_loss = 0
+
+             # Put the model into training mode. Don't be misled--the call to
+             # `train` just changes the *mode*, it doesn't *perform* the training.
+             # `dropout` and `batchnorm` layers behave differently during training
+             # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
+             self.model.train()
+
+             for step, batch in enumerate(train_dataloader):
+
+                 # Progress update every 40 batches.
+                 if step % 40 == 0 and step != 0:
+                     elapsed = self.format_time(time.time() - t0)
+                     print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+                 # Unpack this training batch and copy each tensor to the device.
+                 # `batch` contains three PyTorch tensors:
+                 #   [0]: input ids
+                 #   [1]: attention masks
+                 #   [2]: labels
+                 b_input_ids = batch[0].to(self.device)
+                 b_input_mask = batch[1].to(self.device)
+                 b_labels = batch[2].to(self.device)
+
+                 # Always clear any previously calculated gradients before performing a
+                 # backward pass. PyTorch doesn't do this automatically because
+                 # accumulating the gradients is "convenient while training RNNs".
+                 # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+                 self.model.zero_grad()
+
+                 # Perform a forward pass. Because labels are given, the output
+                 # contains the loss as well as the "logits"--the model outputs
+                 # prior to activation. Documentation:
+                 # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                 output = self.model(b_input_ids,
+                                     token_type_ids=None,
+                                     attention_mask=b_input_mask,
+                                     labels=b_labels)
+
+                 loss = output.loss
+                 logits = output.logits
+
+                 # Accumulate the training loss over all of the batches so that we
+                 # can calculate the average loss at the end. `loss` is a Tensor
+                 # containing a single value; `.item()` returns the Python value.
+                 total_train_loss += loss.item()
+
+                 # Perform a backward pass to calculate the gradients.
+                 loss.backward()
+
+                 # Clip the norm of the gradients to 1.0 to help prevent the
+                 # "exploding gradients" problem.
+                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                 # Update parameters using the computed gradients, then advance
+                 # the learning rate schedule.
+                 optimizer.step()
+                 scheduler.step()
+
+             # Calculate the average loss over all of the batches.
+             avg_train_loss = total_train_loss / len(train_dataloader)
+
+             # Measure how long this epoch took.
+             training_time = self.format_time(time.time() - t0)
+
+             print("")
+             print("  Average training loss: {0:.2f}".format(avg_train_loss))
+             print("  Training epoch took: {:}".format(training_time))
+
+         print("")
+         print("Training complete!")
+         print("Total training took {:} (h:mm:ss)".format(self.format_time(time.time() - total_t0)))
+
+     def evaluate(self, sentences: list):
+         input_ids = []
+         attention_masks = []
+
+         for sent in sentences:
+             input_id, attention_mask = self.tokenizeText(sent)
+             input_ids.append(input_id)
+             attention_masks.append(attention_mask)
+
+         input_ids = torch.cat(input_ids, dim=0)
+         attention_masks = torch.cat(attention_masks, dim=0)
+         labels = torch.zeros(len(sentences))  # dummy labels; unused at inference
+
+         batch_size = 32
+
+         prediction_data = TensorDataset(input_ids, attention_masks, labels)
+         prediction_sampler = SequentialSampler(prediction_data)
+         prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+         self.model.eval()
+
+         predictions = []
+
+         for batch in prediction_dataloader:
+             batch = tuple(t.to(self.device) for t in batch)
+
+             b_input_ids, b_input_mask, _ = batch
+
+             with torch.no_grad():
+                 outputs = self.model(b_input_ids, token_type_ids=None,
+                                      attention_mask=b_input_mask)
+
+             logits = outputs[0]
+             predictions.append(logits.detach().cpu().numpy())
+
+         # Concatenate all batches before taking the argmax; indexing only the
+         # first batch would fail for more than `batch_size` sentences.
+         all_logits = np.concatenate(predictions, axis=0)
+         return [int(row.argmax()) for row in all_logits]
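
A fine-tuning sketch; the sentences and label ids are placeholders, and labels must be a torch.Tensor because tokenizeSentences feeds them straight into TensorDataset:

    import torch
    from src.models.bert import BERTClassifier

    # Placeholder data; the real GoEmotions preprocessing is not part of this commit.
    sentences = ["I love this!", "This is disgusting."]
    labels = torch.tensor([5, 8])  # e.g. 'love' and 'disgust' under the JSON key order

    classifier = BERTClassifier()  # fresh bert-base-uncased head with 14 labels
    classifier.trainModel(sentences, labels, epochs=4, batch_size=32)
    # Saves Bert_GoEmotions_BS32_E4.model, which can be reloaded with
    # BERTClassifier(model_name='Bert_GoEmotions_BS32_E4.model').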
src/utils/__pycache__/utilities.cpython-39.pyc ADDED
Binary file (689 Bytes).
 
src/utils/utilities.py ADDED
@@ -0,0 +1,10 @@
+ import json
+
+ class Utility:
+
+     def read_emotion_list(self):
+         # Coarse class names come from the JSON keys; their insertion order
+         # is assumed to match the label indices the classifier was trained on.
+         with open('./emotion_mapping_finalized.json') as emo_mapping_file:
+             finalized_emotions = json.load(emo_mapping_file)
+         return list(finalized_emotions.keys())
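
The returned list is what gradio_app.py indexes with the predicted class id:

    from src.utils.utilities import Utility

    classes = Utility().read_emotion_list()
    print(len(classes))  # 14, matching num_labels in BERTClassifier
    print(classes[0])    # 'joy', the first key in emotion_mapping_finalized.json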
src/views/index.html ADDED
@@ -0,0 +1,10 @@
+ <!DOCTYPE html>
+ <html>
+ <body>
+     <h1>Sentiment Prediction</h1>
+     <form action="/predict" method="POST">
+         <p><input type="text" name="text" /></p>
+         <p><input type="submit" value="PREDICT" /></p>
+     </form>
+ </body>
+ </html>
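
The form posts the text field to /predict, but the Flask app that would serve it (app.py) is gitignored in this commit. A minimal sketch of what such a backend could look like, assuming the same model and class list as gradio_app.py:

    from flask import Flask, render_template, request

    from src.models.bert import BERTClassifier
    from src.utils.utilities import Utility

    app = Flask(__name__, template_folder='src/views')
    model = BERTClassifier(model_name='Bert_GoEmotions_4Epochs.model')
    classes = Utility().read_emotion_list()

    @app.route('/')
    def index():
        return render_template('index.html')

    @app.route('/predict', methods=['POST'])
    def predict():
        sentence = request.form['text']
        return classes[model.evaluate([sentence])[0]]

    if __name__ == '__main__':
        app.run()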