cspocketindia committed on
Commit 01f65eb
0 Parent(s)

first commit

.gitignore ADDED
@@ -0,0 +1,7 @@
+ /app.py
+ /Bert_GoEmotions_4Epochs.model
+ /curl_gradio.bat
+ /Dockerfile
+ /flagged
+ /run_gradio_client.py
+ /streamlit_app.py
emotion_mapping_finalized.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "joy": ["joy", "amusement", "excitement"],
+     "desire": ["desire"],
+     "pride": ["pride", "admiration", "relief"],
+     "agreement": ["approval", "realization"],
+     "surprise": ["surprise", "curiosity"],
+     "love": ["love", "caring"],
+     "confusion": ["confusion"],
+     "anger": ["anger", "disapproval"],
+     "disgust": ["disgust", "annoyance"],
+     "sadness": ["sadness", "grief", "remorse", "embarrassment"],
+     "fear": ["fear", "nervousness"],
+     "optimism": ["optimism", "gratitude"],
+     "disappointment": ["disappointment"],
+     "neutral": ["neutral"]
+ }
gradio_app.py ADDED
@@ -0,0 +1,23 @@
+ import gradio
+
+ from src.models.bert import BERTClassifier
+ from src.utils.utilities import Utility
+
+ model = BERTClassifier(model_name='Bert_GoEmotions_4Epochs.model')
+
+ classes = Utility().read_emotion_list()
+
+ def predict(sentence):
+     print(sentence)
+     predictions = model.evaluate([sentence])
+     print(f"Predictions: {predictions}")
+     return classes[predictions[0]]
+
+ gradio.Interface(
+     fn=predict,
+     inputs="text",
+     outputs="text",
+     allow_flagging='auto',
+     flagging_dir='logs',
+     flagging_callback=gradio.SimpleCSVLogger(),
+ ).launch()
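
The gitignored run_gradio_client.py and curl_gradio.bat suggest the interface is also exercised over HTTP. A minimal client sketch, assuming the app is running locally on Gradio's default port and the classic /api/predict endpoint (neither is confirmed by this commit):

    import requests

    # Hypothetical client; the URL and endpoint are assumptions based on Gradio defaults.
    response = requests.post(
        "http://127.0.0.1:7860/api/predict/",
        json={"data": ["I can't believe how well this worked!"]},
    )
    print(response.json()["data"][0])  # one of the 14 coarse emotion labels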
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # pyarrow
+ # plotly
+ # nbformat
+ # gensim
+ # keras
+ pandas
+ seaborn
+ nltk
+ wordcloud
+ tensorflow
+ tensorflow_hub
+ transformers
+ flask
+ torch
+ torchvision
+ scikit-learn
+ numpy
src/data_loader/go_emotions.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+
+ from ..utils.utilities import Utility
+
+
+ class GoEmotionsDataset(torch.utils.data.Dataset):
+     util = Utility()
+
+     def __init__(self, embeddings, labels):
+         self.labels = labels
+         self.instances = embeddings
+
+     def __len__(self):
+         return self.instances.shape[0]
+
+     def __getitem__(self, idx):
+         return self.instances[idx], self.labels[idx]
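
A usage sketch for the dataset wrapper; the embedding shape and label range below are placeholders, since the preprocessing that produces them is not part of this commit:

    import torch
    from torch.utils.data import DataLoader
    from src.data_loader.go_emotions import GoEmotionsDataset

    # Placeholder tensors standing in for precomputed sentence embeddings and labels.
    embeddings = torch.randn(100, 768)
    labels = torch.randint(0, 14, (100,))

    dataset = GoEmotionsDataset(embeddings, labels)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    for batch_embeddings, batch_labels in loader:
        print(batch_embeddings.shape, batch_labels.shape)  # torch.Size([32, 768]) torch.Size([32])
        break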
src/models/__pycache__/bert.cpython-39.pyc ADDED
Binary file (5.74 kB).
 
src/models/bert.py ADDED
@@ -0,0 +1,227 @@
+ import time
+ import datetime
+ import torch
+ import numpy as np
+ from transformers import AdamW, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
+ from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
+
+
+ class BERTClassifier:
+
+     def __init__(self, model_name="bert-base-uncased", tokenizer_name="bert-base-uncased") -> None:
+         print(f'Loading BERT tokenizer: {model_name}...')
+
+         self.model_name = model_name
+
+         self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
+
+         if model_name.endswith('.model'):
+             # Load a fine-tuned checkpoint, re-saving it on the CPU so it can
+             # be loaded on machines without a GPU.
+             self.model = torch.load(model_name)
+             torch.save(self.model.cpu(), model_name)
+         else:
+             self.model = BertForSequenceClassification.from_pretrained(
+                 self.model_name,
+                 num_labels=14,
+                 output_attentions=False,
+                 output_hidden_states=False
+             )
+
+         self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+         self.model.to(self.device)
+
+     def tokenizeText(self, sentence: str):
+         encoded_dict = self.tokenizer.encode_plus(
+             sentence,
+             add_special_tokens=True,
+             max_length=64,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt')
+         return encoded_dict['input_ids'], encoded_dict['attention_mask']
+
+     def tokenizeSentences(self, sentences: list, labels: torch.Tensor):
+         input_ids = []
+         attention_masks = []
+         for sent in sentences:
+             input_id, attention_mask = self.tokenizeText(sent)
+             input_ids.append(input_id)
+             attention_masks.append(attention_mask)
+
+         input_ids = torch.cat(input_ids, dim=0)
+         attention_masks = torch.cat(attention_masks, dim=0)
+
+         dataset = TensorDataset(input_ids, attention_masks, labels)
+
+         # 90/10 train/validation split.
+         train_size = int(0.9 * len(dataset))
+         val_size = len(dataset) - train_size
+         return random_split(dataset, [train_size, val_size])
+
+     def flat_accuracy(self, preds, labels):
+         pred_flat = np.argmax(preds, axis=1).flatten()
+         labels_flat = labels.flatten()
+         return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+     def format_time(self, elapsed):
+         # Round to the nearest second and format as hh:mm:ss.
+         elapsed_rounded = int(round(elapsed))
+         return str(datetime.timedelta(seconds=elapsed_rounded))
+
+     def trainModel(self, sentences: list, labels: torch.Tensor, epochs=4, batch_size=32):
+         optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
+
+         train_dataset, val_dataset = self.tokenizeSentences(sentences, labels)
+
+         train_dataloader = DataLoader(
+             train_dataset,
+             sampler=RandomSampler(train_dataset),
+             batch_size=batch_size
+         )
+
+         validation_dataloader = DataLoader(
+             val_dataset,
+             sampler=SequentialSampler(val_dataset),
+             batch_size=batch_size
+         )
+
+         total_steps = len(train_dataloader) * epochs
+
+         # Create the learning rate scheduler.
+         scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                     num_warmup_steps=0,  # Default value in run_glue.py
+                                                     num_training_steps=total_steps)
+
+         self.train(train_dataloader, optimizer, scheduler, epochs)
+         torch.save(self.model, f"Bert_GoEmotions_BS{batch_size}_E{epochs}.model")
+
+     def train(self, train_dataloader, optimizer, scheduler, epochs):
+         # This training code is based on the `run_glue.py` script here:
+         # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+         # Measure the total training time for the whole run.
+         total_t0 = time.time()
+
+         for epoch_i in range(epochs):
+             print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+             print('Training...')
+
+             # Measure how long the training epoch takes.
+             t0 = time.time()
+
+             # Reset the total loss for this epoch.
+             total_train_loss = 0
+
+             # Put the model into training mode. Don't be misled--the call to
+             # `train` just changes the *mode*, it doesn't *perform* the training.
+             # `dropout` and `batchnorm` layers behave differently during training
+             # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
+             self.model.train()
+
+             for step, batch in enumerate(train_dataloader):
+
+                 # Progress update every 40 batches.
+                 if step % 40 == 0 and step != 0:
+                     elapsed = self.format_time(time.time() - t0)
+                     print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+                 # Unpack this training batch and copy each tensor to the device.
+                 # `batch` contains three PyTorch tensors:
+                 #   [0]: input ids
+                 #   [1]: attention masks
+                 #   [2]: labels
+                 b_input_ids = batch[0].to(self.device)
+                 b_input_mask = batch[1].to(self.device)
+                 b_labels = batch[2].to(self.device)
+
+                 # Always clear any previously calculated gradients before performing a
+                 # backward pass. PyTorch doesn't do this automatically because
+                 # accumulating the gradients is "convenient while training RNNs".
+                 # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+                 self.model.zero_grad()
+
+                 # Perform a forward pass. Because labels are given, the output
+                 # contains the loss as well as the "logits"--the model outputs
+                 # prior to activation. Documentation:
+                 # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                 output = self.model(b_input_ids,
+                                     token_type_ids=None,
+                                     attention_mask=b_input_mask,
+                                     labels=b_labels)
+
+                 loss = output.loss
+                 logits = output.logits
+
+                 # Accumulate the training loss over all of the batches so that we
+                 # can calculate the average loss at the end. `loss` is a Tensor
+                 # containing a single value; `.item()` returns the Python value.
+                 total_train_loss += loss.item()
+
+                 # Perform a backward pass to calculate the gradients.
+                 loss.backward()
+
+                 # Clip the norm of the gradients to 1.0 to help prevent the
+                 # "exploding gradients" problem.
+                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                 # Update parameters using the computed gradients, then advance
+                 # the learning rate schedule.
+                 optimizer.step()
+                 scheduler.step()
+
+             # Calculate the average loss over all of the batches.
+             avg_train_loss = total_train_loss / len(train_dataloader)
+
+             # Measure how long this epoch took.
+             training_time = self.format_time(time.time() - t0)
+
+             print("")
+             print("  Average training loss: {0:.2f}".format(avg_train_loss))
+             print("  Training epoch took: {:}".format(training_time))
+
+         print("")
+         print("Training complete!")
+         print("Total training took {:} (h:mm:ss)".format(self.format_time(time.time() - total_t0)))
+
+     def evaluate(self, sentences: list):
+         input_ids = []
+         attention_masks = []
+
+         for sent in sentences:
+             input_id, attention_mask = self.tokenizeText(sent)
+             input_ids.append(input_id)
+             attention_masks.append(attention_mask)
+
+         input_ids = torch.cat(input_ids, dim=0)
+         attention_masks = torch.cat(attention_masks, dim=0)
+         labels = torch.zeros(len(sentences))  # dummy labels; unused at inference
+
+         batch_size = 32
+
+         prediction_data = TensorDataset(input_ids, attention_masks, labels)
+         prediction_sampler = SequentialSampler(prediction_data)
+         prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+         self.model.eval()
+
+         predictions = []
+
+         for batch in prediction_dataloader:
+             batch = tuple(t.to(self.device) for t in batch)
+
+             b_input_ids, b_input_mask, _ = batch
+
+             with torch.no_grad():
+                 outputs = self.model(b_input_ids, token_type_ids=None,
+                                      attention_mask=b_input_mask)
+
+             logits = outputs[0]
+             predictions.append(logits.detach().cpu().numpy())
+
+         # Concatenate all batches before taking the argmax; indexing only the
+         # first batch would fail for more than `batch_size` sentences.
+         all_logits = np.concatenate(predictions, axis=0)
+         return [int(row.argmax()) for row in all_logits]
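
A fine-tuning sketch; the sentences and label ids are placeholders, and labels must be a torch.Tensor because tokenizeSentences feeds them straight into TensorDataset:

    import torch
    from src.models.bert import BERTClassifier

    # Placeholder data; the real GoEmotions preprocessing is not part of this commit.
    sentences = ["I love this!", "This is disgusting."]
    labels = torch.tensor([5, 8])  # e.g. 'love' and 'disgust' under the JSON key order

    classifier = BERTClassifier()  # fresh bert-base-uncased head with 14 labels
    classifier.trainModel(sentences, labels, epochs=4, batch_size=32)
    # Saves Bert_GoEmotions_BS32_E4.model, which can be reloaded with
    # BERTClassifier(model_name='Bert_GoEmotions_BS32_E4.model').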
src/utils/__pycache__/utilities.cpython-39.pyc ADDED
Binary file (689 Bytes).
 
src/utils/utilities.py ADDED
@@ -0,0 +1,10 @@
+ import json
+
+ class Utility:
+
+     def read_emotion_list(self):
+         # Coarse class names come from the JSON keys; their insertion order
+         # is assumed to match the label indices the classifier was trained on.
+         with open('./emotion_mapping_finalized.json') as emo_mapping_file:
+             finalized_emotions = json.load(emo_mapping_file)
+         return list(finalized_emotions.keys())
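
The returned list is what gradio_app.py indexes with the predicted class id:

    from src.utils.utilities import Utility

    classes = Utility().read_emotion_list()
    print(len(classes))  # 14, matching num_labels in BERTClassifier
    print(classes[0])    # 'joy', the first key in emotion_mapping_finalized.json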
src/views/index.html ADDED
@@ -0,0 +1,10 @@
+ <!DOCTYPE html>
+ <html>
+ <body>
+     <h1>Sentiment Prediction</h1>
+     <form action="/predict" method="POST">
+         <p><input type="text" name="text" /></p>
+         <p><input type="submit" value="PREDICT" /></p>
+     </form>
+ </body>
+ </html>
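
The form posts the text field to /predict, but the Flask app that would serve it (app.py) is gitignored in this commit. A minimal sketch of what such a backend could look like, assuming the same model and class list as gradio_app.py:

    from flask import Flask, render_template, request

    from src.models.bert import BERTClassifier
    from src.utils.utilities import Utility

    app = Flask(__name__, template_folder='src/views')
    model = BERTClassifier(model_name='Bert_GoEmotions_4Epochs.model')
    classes = Utility().read_emotion_list()

    @app.route('/')
    def index():
        return render_template('index.html')

    @app.route('/predict', methods=['POST'])
    def predict():
        sentence = request.form['text']
        return classes[model.evaluate([sentence])[0]]

    if __name__ == '__main__':
        app.run()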