Azhageswari committed on
Commit 8eed319 • 1 Parent(s): 9619abc

Upload 6 files

README.md CHANGED
@@ -1,13 +1,14 @@
 ---
-title: Nlp Goemotion Sentimentanalysis
-emoji: 👀
-colorFrom: pink
-colorTo: blue
+title: NLP Sentiment Prediction with GoEmotions
+emoji: πŸƒ
+colorFrom: gray
+colorTo: gray
 sdk: gradio
-sdk_version: 4.11.0
-app_file: app.py
+sdk_version: 3.29.0
+app_file: gradio_app.py
 pinned: false
 license: apache-2.0
+python_version: 3.9.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
emotion_mapping_finalized.json ADDED
@@ -0,0 +1,16 @@
+{
+    "joy": ["joy", "amusement", "excitement"],
+    "desire": ["desire"],
+    "pride": ["pride", "admiration", "relief"],
+    "agreement": ["approval", "realization"],
+    "surprise": ["surprise", "curiosity"],
+    "love": ["love", "caring"],
+    "confusion": ["confusion"],
+    "anger": ["anger", "disapproval"],
+    "disgust": ["disgust", "annoyance"],
+    "sadness": ["sadness", "grief", "remorse", "embarrassment"],
+    "fear": ["fear", "nervousness"],
+    "optimism": ["optimism", "gratitude"],
+    "disappointment": ["disappointment"],
+    "neutral": ["neutral"]
+}
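
This mapping collapses the 28 fine-grained GoEmotions labels (27 emotions plus neutral) into 14 coarse classes, which is why src/models/bert.py builds its classifier head with num_labels=14. A minimal sketch, not part of the commit, of loading the file and inverting it into a fine-to-coarse lookup:

# Sketch (not in the commit): invert the coarse->fine mapping above.
import json

with open("emotion_mapping_finalized.json") as f:
    coarse_to_fine = json.load(f)

fine_to_coarse = {fine: coarse
                  for coarse, fines in coarse_to_fine.items()
                  for fine in fines}

print(len(coarse_to_fine))        # 14 coarse classes
print(fine_to_coarse["grief"])    # sadness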
gradio_app.py ADDED
@@ -0,0 +1,75 @@
+import os
+import time
+import csv
+import datetime
+import gradio
+import schedule
+from gradio import utils
+import huggingface_hub
+from pathlib import Path
+from src.models.bert import BERTClassifier
+from src.utils.utilities import Utility
+
+model = BERTClassifier(model_name='jeevavijay10/nlp-goemotions-bert')
+
+classes = Utility().read_emotion_list()
+
+hf_token = os.getenv("HF_TOKEN")
+
+dataset_dir = "logs"
+
+headers = ["input", "output", "timestamp", "elapsed"]
+
+
+repo = huggingface_hub.Repository(
+    local_dir=dataset_dir,
+    clone_from="https://huggingface.co/datasets/jeevavijay10/senti-pred-gradio",
+    token=hf_token,
+)
+repo.git_pull(lfs=True)
+
+def log_record(vals):
+    log_file = Path(dataset_dir) / "data.csv"
+    is_new = not Path(log_file).exists()
+    with open(log_file, "a", newline="", encoding="utf-8") as csvfile:
+        writer = csv.writer(csvfile)
+        if is_new:
+            writer.writerow(utils.sanitize_list_for_csv(headers))
+        writer.writerow(utils.sanitize_list_for_csv(vals))
+    schedule.run_pending()
+    print(f"Last Sync: {job.last_run}")
+
+def predict(sentence):
+
+    timestamp = datetime.datetime.now().isoformat()
+    start_time = time.time()
+    predictions = model.evaluate([sentence])
+    elapsed_time = time.time() - start_time
+
+    output = classes[predictions[0]]
+
+    print(f"Sentence: {sentence} \nPrediction: {predictions[0]} - {output}")
+    log_record([sentence, output, timestamp, str(elapsed_time)])
+
+    return output
+
+
+def sync_logs():
+    print(f"Repo Clean: {repo.is_repo_clean()}")
+    if not repo.is_repo_clean():
+        repo.git_add()
+        repo.git_commit()
+        repo.git_pull(lfs=True)
+        result = repo.git_push()
+        # result = repo.push_to_hub()
+        print(result)
+
+job = schedule.every(5).minutes.do(sync_logs)
+print("Scheduler engaged")
+
+gradio.Interface(
+    fn=predict,
+    inputs="text",
+    outputs="text",
+    allow_flagging='never'
+).launch()
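
A note on how the pieces above fit together: every call to predict() appends one row to logs/data.csv via log_record(), and the dataset sync is driven entirely by the schedule.run_pending() call inside log_record(). There is no background thread, so sync_logs() only fires when a prediction arrives after the five-minute job has come due. A minimal sketch of that scheduling behaviour, with the repository work stubbed out:

# Sketch (not in the commit): the `schedule` library only fires jobs when
# run_pending() is called, so the 5-minute sync piggybacks on incoming requests.
import schedule

def sync_logs():
    print("would commit and push logs/ here")  # stand-in for the git logic above

job = schedule.every(5).minutes.do(sync_logs)

schedule.run_pending()   # no-op until 5 minutes after the job was registered
# ... later, e.g. inside log_record() while handling a request:
schedule.run_pending()   # runs sync_logs() once if the job is due
print(job.last_run)      # None until the job has fired at least once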
requirements.txt ADDED
@@ -0,0 +1,18 @@
+# pyarrow
+# plotly
+# nbformat
+# gensim
+# keras
+pandas
+seaborn
+nltk
+wordcloud
+tensorflow
+tensorflow_hub
+transformers
+flask
+torch
+torchvision
+scikit-learn
+numpy
+schedule
src/models/bert.py ADDED
@@ -0,0 +1,257 @@
+import time
+import datetime
+import torch
+import numpy as np
+import tqdm
+import random
+from torch import nn
+from transformers import RobertaTokenizer, RobertaModel, AdamW, RobertaConfig
+from sklearn.model_selection import train_test_split
+
+from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AutoModel, AutoTokenizer
+from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
+
+
+class BERTClassifier():
+
+
+    def __init__(self, model_name="bert-base-uncased", tokenizer_name="bert-base-uncased") -> None:
+        print(f'Loading BERT: {model_name}...')
+
+        self.model_name = model_name
+
+        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
+
+        if model_name.startswith('jeevavijay10'):
+            # self.model = torch.load(model_name)
+            self.model = BertForSequenceClassification.from_pretrained(model_name)
+        else:
+            self.model = BertForSequenceClassification.from_pretrained(
+                self.model_name,
+                num_labels=14,
+                output_attentions=False,
+                output_hidden_states=False
+            )
+
+        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+        self.model.to(self.device)
+
+    def tokenizeText(self, sentence: str):
+        # return self.tokenizer.encode(sentence, add_special_tokens=True)
+        encoded_dict = self.tokenizer.encode_plus(
+            sentence,
+            add_special_tokens=True,
+            max_length=64,
+            pad_to_max_length=True,
+            return_attention_mask=True,
+            return_tensors='pt')
+        return encoded_dict['input_ids'], encoded_dict['attention_mask']
+
+    def tokenizeSentences(self, sentences: list, labels: list):
+        input_ids = []
+        attention_masks = []
+        for sent in sentences:
+            input_id, attention_mask = self.tokenizeText(sent)
+            input_ids.append(input_id)
+            attention_masks.append(attention_mask)
+
+        input_ids = torch.cat(input_ids, dim=0)
+        attention_masks = torch.cat(attention_masks, dim=0)
+
+        dataset = TensorDataset(input_ids, attention_masks, labels)
+
+        train_size = int(0.9 * len(dataset))
+        val_size = len(dataset) - train_size
+        return random_split(dataset, [train_size, val_size])
+
+    def flat_accuracy(self, preds, labels):
+        pred_flat = np.argmax(preds, axis=1).flatten()
+        labels_flat = labels.flatten()
+        return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+    def format_time(self, elapsed):
+        # Round to the nearest second.
+        elapsed_rounded = int(round((elapsed)))
+
+        # Format as hh:mm:ss
+        return str(datetime.timedelta(seconds=elapsed_rounded))
+
+    def trainModel(self, sentences: list, labels: list, epochs=4, batch_size=32):
+        optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
+
+        train_dataset, val_dataset = self.tokenizeSentences(sentences, labels)
+
+        train_dataloader = DataLoader(
+            train_dataset,
+            sampler=RandomSampler(train_dataset),
+            batch_size=batch_size
+        )
+
+        validation_dataloader = DataLoader(
+            val_dataset,
+            sampler=SequentialSampler(val_dataset),
+            batch_size=batch_size
+        )
+
+        total_steps = len(train_dataloader) * epochs
+
+        # Create the learning rate scheduler.
+        scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                    num_warmup_steps=0,  # Default value in run_glue.py
+                                                    num_training_steps=total_steps)
+
+        self.train(train_dataloader, optimizer, scheduler, epochs)
+        torch.save(self.model, f"Bert_GoEmotions_BS{batch_size}_E{epochs}.model")
+
+
+    def train(self, train_dataloader, optimizer, scheduler, epochs):
+        # This training code is based on the `run_glue.py` script here:
+        # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+        # Measure the total training time for the whole run.
+        total_t0 = time.time()
+
+        # For each epoch...
+        for epoch_i in range(epochs):
+            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+            print('Training...')
+
+            # Measure how long the training epoch takes.
+            t0 = time.time()
+
+            # Reset the total loss for this epoch.
+            total_train_loss = 0
+
+            # Put the model into training mode. Don't be misled--the call to
+            # `train` just changes the *mode*, it doesn't *perform* the training.
+            # `dropout` and `batchnorm` layers behave differently during training
+            # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
+            self.model.train()
+
+            # For each batch of training data...
+            for step, batch in enumerate(train_dataloader):
+
+                # Progress update every 40 batches.
+                if step % 40 == 0 and step != 0:
+                    # Calculate the elapsed time as hh:mm:ss.
+                    elapsed = self.format_time(time.time() - t0)
+
+                    # Report progress.
+                    print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+                # Unpack this training batch from our dataloader.
+                #
+                # As we unpack the batch, we'll also copy each tensor to the GPU using the
+                # `to` method.
+                #
+                # `batch` contains three pytorch tensors:
+                #   [0]: input ids
+                #   [1]: attention masks
+                #   [2]: labels
+                b_input_ids = batch[0].to(self.device)
+                b_input_mask = batch[1].to(self.device)
+                b_labels = batch[2].to(self.device)
+
+                # Always clear any previously calculated gradients before performing a
+                # backward pass. PyTorch doesn't do this automatically because
+                # accumulating the gradients is "convenient while training RNNs".
+                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+                self.model.zero_grad()
+
+                # Perform a forward pass (evaluate the model on this training batch).
+                # The documentation for this `model` function is here:
+                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                # It returns different numbers of parameters depending on what arguments
+                # are given and what flags are set. For our usage here, it returns
+                # the loss (because we provided labels) and the "logits"--the model
+                # outputs prior to activation.
+
+                output = self.model(b_input_ids,
+                                    token_type_ids=None,
+                                    attention_mask=b_input_mask,
+                                    labels=b_labels)
+
+
+                loss = output.loss
+                logits = output.logits
+
+                # Accumulate the training loss over all of the batches so that we can
+                # calculate the average loss at the end. `loss` is a Tensor containing a
+                # single value; the `.item()` function just returns the Python value
+                # from the tensor.
+                total_train_loss += loss.item()
+
+                # Perform a backward pass to calculate the gradients.
+                loss.backward()
+
+                # Clip the norm of the gradients to 1.0.
+                # This is to help prevent the "exploding gradients" problem.
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                # Update parameters and take a step using the computed gradient.
+                # The optimizer dictates the "update rule"--how the parameters are
+                # modified based on their gradients, the learning rate, etc.
+                optimizer.step()
+
+                # Update the learning rate.
+                scheduler.step()
+
+            # Calculate the average loss over all of the batches.
+            avg_train_loss = total_train_loss / len(train_dataloader)
+
+            # Measure how long this epoch took.
+            training_time = self.format_time(time.time() - t0)
+
+            print("")
+            print(" Average training loss: {0:.2f}".format(avg_train_loss))
+            print(" Training epoch took: {:}".format(training_time))
+
+        print("")
+        print("Training complete!")
+
+        print("Total training took {:} (h:mm:ss)".format(self.format_time(time.time()-total_t0)))
+
+    def evaluate(self, sentences: list):
+        input_ids = []
+        attention_masks = []
+
+        for sent in sentences:
+            input_id, attention_mask = self.tokenizeText(sent)
+            input_ids.append(input_id)
+            attention_masks.append(attention_mask)
+
+        input_ids = torch.cat(input_ids, dim=0)
+        attention_masks = torch.cat(attention_masks, dim=0)
+        labels = torch.zeros(len(sentences))
+
+        batch_size = 32
+
+        prediction_data = TensorDataset(input_ids, attention_masks, labels)
+        prediction_sampler = SequentialSampler(prediction_data)
+        prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+        self.model.eval()
+
+        predictions = []
+
+        for batch in prediction_dataloader:
+            batch = tuple(t.to(self.device) for t in batch)
+
+            b_input_ids, b_input_mask, _ = batch
+
+            with torch.no_grad():
+                outputs = self.model(b_input_ids, token_type_ids=None,
+                                     attention_mask=b_input_mask)
+
+            logits = outputs[0]
+
+            logits = logits.detach().cpu().numpy()
+            predictions.append(logits)
+
+        # Concatenate all batches so indexing works even for more than 32 sentences.
+        all_logits = np.concatenate(predictions, axis=0)
+        return [int(all_logits[i].argmax()) for i in range(len(sentences))]
+
+
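
For reference, a minimal offline sketch of how the two new modules are meant to be used together, mirroring the per-request path in gradio_app.py. It assumes the jeevavijay10/nlp-goemotions-bert checkpoint is reachable and was fine-tuned with labels in the same order as the keys of emotion_mapping_finalized.json:

# Sketch (not in the commit): map a predicted class index back to a coarse emotion name.
from src.models.bert import BERTClassifier
from src.utils.utilities import Utility

classes = Utility().read_emotion_list()   # the 14 coarse emotion names, in JSON key order
model = BERTClassifier(model_name="jeevavijay10/nlp-goemotions-bert")

# evaluate() returns one argmax class index per input sentence
indices = model.evaluate(["I can't believe this actually worked!"])
print(classes[indices[0]])                # e.g. "surprise"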
src/utils/utilities.py ADDED
@@ -0,0 +1,12 @@
+import json
+
+class Utility:
+
+    def read_emotion_list(self):
+        with open('./emotion_mapping_finalized.json') as emo_mapping_file:
+            finalized_emotions = json.load(emo_mapping_file)
+        emotions_mapping = {}
+        for key, values in finalized_emotions.items():
+            for emotion in values:
+                emotions_mapping[emotion] = key
+        return list(finalized_emotions.keys())