Azhageswari committed on
Commit 8eed319 • 1 Parent(s): 9619abc

Upload 6 files

README.md CHANGED
@@ -1,13 +1,14 @@
 ---
-title: Nlp Goemotion Sentimentanalysis
-emoji: 👀
-colorFrom: pink
-colorTo: blue
+title: NLP Sentiment Prediction with GoEmotions
+emoji: πŸƒ
+colorFrom: gray
+colorTo: gray
 sdk: gradio
-sdk_version: 4.11.0
-app_file: app.py
+sdk_version: 3.29.0
+app_file: gradio_app.py
 pinned: false
 license: apache-2.0
+python_version: 3.9.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
emotion_mapping_finalized.json ADDED
@@ -0,0 +1,16 @@
+{
+    "joy": ["joy", "amusement", "excitement"],
+    "desire": ["desire"],
+    "pride": ["pride", "admiration", "relief"],
+    "agreement": ["approval", "realization"],
+    "surprise": ["surprise", "curiosity"],
+    "love": ["love", "caring"],
+    "confusion": ["confusion"],
+    "anger": ["anger", "disapproval"],
+    "disgust": ["disgust", "annoyance"],
+    "sadness": ["sadness", "grief", "remorse", "embarrassment"],
+    "fear": ["fear", "nervousness"],
+    "optimism": ["optimism", "gratitude"],
+    "disappointment": ["disappointment"],
+    "neutral": ["neutral"]
+}
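
This mapping collapses the 28 fine-grained GoEmotions labels (27 emotions plus neutral) into 14 coarse classes, which is why src/models/bert.py builds its classifier head with num_labels=14. A minimal sketch, not part of the commit, of loading the file and inverting it into a fine-to-coarse lookup:

# Sketch (not in the commit): invert the coarse->fine mapping above.
import json

with open("emotion_mapping_finalized.json") as f:
    coarse_to_fine = json.load(f)

fine_to_coarse = {fine: coarse
                  for coarse, fines in coarse_to_fine.items()
                  for fine in fines}

print(len(coarse_to_fine))        # 14 coarse classes
print(fine_to_coarse["grief"])    # sadness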
gradio_app.py ADDED
@@ -0,0 +1,75 @@
+import os
+import time
+import csv
+import datetime
+import gradio
+import schedule
+from gradio import utils
+import huggingface_hub
+from pathlib import Path
+from src.models.bert import BERTClassifier
+from src.utils.utilities import Utility
+
+model = BERTClassifier(model_name='jeevavijay10/nlp-goemotions-bert')
+
+classes = Utility().read_emotion_list()
+
+hf_token = os.getenv("HF_TOKEN")
+
+dataset_dir = "logs"
+
+headers = ["input", "output", "timestamp", "elapsed"]
+
+
+repo = huggingface_hub.Repository(
+    local_dir=dataset_dir,
+    clone_from="https://huggingface.co/datasets/jeevavijay10/senti-pred-gradio",
+    token=hf_token,
+)
+repo.git_pull(lfs=True)
+
+def log_record(vals):
+    log_file = Path(dataset_dir) / "data.csv"
+    is_new = not Path(log_file).exists()
+    with open(log_file, "a", newline="", encoding="utf-8") as csvfile:
+        writer = csv.writer(csvfile)
+        if is_new:
+            writer.writerow(utils.sanitize_list_for_csv(headers))
+        writer.writerow(utils.sanitize_list_for_csv(vals))
+    schedule.run_pending()
+    print(f"Last Sync: {job.last_run}")
+
+def predict(sentence):
+
+    timestamp = datetime.datetime.now().isoformat()
+    start_time = time.time()
+    predictions = model.evaluate([sentence])
+    elapsed_time = time.time() - start_time
+
+    output = classes[predictions[0]]
+
+    print(f"Sentence: {sentence} \nPrediction: {predictions[0]} - {output}")
+    log_record([sentence, output, timestamp, str(elapsed_time)])
+
+    return output
+
+
+def sync_logs():
+    print(f"Repo Clean: {repo.is_repo_clean()}")
+    if not repo.is_repo_clean():
+        repo.git_add()
+        repo.git_commit()
+        repo.git_pull(lfs=True)
+        result = repo.git_push()
+        # result = repo.push_to_hub()
+        print(result)
+
+job = schedule.every(5).minutes.do(sync_logs)
+print("Scheduler engaged")
+
+gradio.Interface(
+    fn=predict,
+    inputs="text",
+    outputs="text",
+    allow_flagging='never'
+).launch()
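
A note on how the pieces above fit together: every call to predict() appends one row to logs/data.csv via log_record(), and the dataset sync is driven entirely by the schedule.run_pending() call inside log_record(). There is no background thread, so sync_logs() only fires when a prediction arrives after the five-minute job has come due. A minimal sketch of that scheduling behaviour, with the repository work stubbed out:

# Sketch (not in the commit): the `schedule` library only fires jobs when
# run_pending() is called, so the 5-minute sync piggybacks on incoming requests.
import schedule

def sync_logs():
    print("would commit and push logs/ here")  # stand-in for the git logic above

job = schedule.every(5).minutes.do(sync_logs)

schedule.run_pending()   # no-op until 5 minutes after the job was registered
# ... later, e.g. inside log_record() while handling a request:
schedule.run_pending()   # runs sync_logs() once if the job is due
print(job.last_run)      # None until the job has fired at least once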
requirements.txt ADDED
@@ -0,0 +1,18 @@
+# pyarrow
+# plotly
+# nbformat
+# gensim
+# keras
+pandas
+seaborn
+nltk
+wordcloud
+tensorflow
+tensorflow_hub
+transformers
+flask
+torch
+torchvision
+scikit-learn
+numpy
+schedule
src/models/bert.py ADDED
@@ -0,0 +1,257 @@
+import time
+import datetime
+import torch
+import numpy as np
+import tqdm
+import random
+from torch import nn
+from transformers import RobertaTokenizer, RobertaModel, AdamW, RobertaConfig
+from sklearn.model_selection import train_test_split
+
+from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AutoModel, AutoTokenizer
+from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
+
+
+class BERTClassifier():
+
+
+    def __init__(self, model_name="bert-base-uncased", tokenizer_name="bert-base-uncased") -> None:
+        print(f'Loading BERT: {model_name}...')
+
+        self.model_name = model_name
+
+        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
+
+        if model_name.startswith('jeevavijay10'):
+            # self.model = torch.load(model_name)
+            self.model = BertForSequenceClassification.from_pretrained(model_name)
+        else:
+            self.model = BertForSequenceClassification.from_pretrained(
+                self.model_name,
+                num_labels=14,
+                output_attentions=False,
+                output_hidden_states=False
+            )
+
+        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+        self.model.to(self.device)
+
+    def tokenizeText(self, sentence: str):
+        # return self.tokenizer.encode(sentence, add_special_tokens=True)
+        encoded_dict = self.tokenizer.encode_plus(
+            sentence,
+            add_special_tokens=True,
+            max_length=64,
+            pad_to_max_length=True,
+            return_attention_mask=True,
+            return_tensors='pt')
+        return encoded_dict['input_ids'], encoded_dict['attention_mask']
+
+    def tokenizeSentences(self, sentences: list, labels: list):
+        input_ids = []
+        attention_masks = []
+        for sent in sentences:
+            input_id, attention_mask = self.tokenizeText(sent)
+            input_ids.append(input_id)
+            attention_masks.append(attention_mask)
+
+        input_ids = torch.cat(input_ids, dim=0)
+        attention_masks = torch.cat(attention_masks, dim=0)
+
+        dataset = TensorDataset(input_ids, attention_masks, labels)
+
+        train_size = int(0.9 * len(dataset))
+        val_size = len(dataset) - train_size
+        return random_split(dataset, [train_size, val_size])
+
+    def flat_accuracy(self, preds, labels):
+        pred_flat = np.argmax(preds, axis=1).flatten()
+        labels_flat = labels.flatten()
+        return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+    def format_time(self, elapsed):
+        # Round to the nearest second.
+        elapsed_rounded = int(round((elapsed)))
+
+        # Format as hh:mm:ss
+        return str(datetime.timedelta(seconds=elapsed_rounded))
+
+    def trainModel(self, sentences: list, labels: list, epochs=4, batch_size=32):
+        optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
+
+        train_dataset, val_dataset = self.tokenizeSentences(sentences, labels)
+
+        train_dataloader = DataLoader(
+            train_dataset,
+            sampler=RandomSampler(train_dataset),
+            batch_size=batch_size
+        )
+
+        validation_dataloader = DataLoader(
+            val_dataset,
+            sampler=SequentialSampler(val_dataset),
+            batch_size=batch_size
+        )
+
+        total_steps = len(train_dataloader) * epochs
+
+        # Create the learning rate scheduler.
+        scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                    num_warmup_steps=0,  # Default value in run_glue.py
+                                                    num_training_steps=total_steps)
+
+        self.train(train_dataloader, optimizer, scheduler, epochs)
+        torch.save(self.model, f"Bert_GoEmotions_BS{batch_size}_E{epochs}.model")
+
+
+    def train(self, train_dataloader, optimizer, scheduler, epochs):
+        # This training code is based on the `run_glue.py` script here:
+        # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+        # Measure the total training time for the whole run.
+        total_t0 = time.time()
+
+        # For each epoch...
+        for epoch_i in range(epochs):
+            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+            print('Training...')
+
+            # Measure how long the training epoch takes.
+            t0 = time.time()
+
+            # Reset the total loss for this epoch.
+            total_train_loss = 0
+
+            # Put the model into training mode. Don't be misled--the call to
+            # `train` just changes the *mode*, it doesn't *perform* the training.
+            # `dropout` and `batchnorm` layers behave differently during training
+            # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
+            self.model.train()
+
+            # For each batch of training data...
+            for step, batch in enumerate(train_dataloader):
+
+                # Progress update every 40 batches.
+                if step % 40 == 0 and step != 0:
+                    # Calculate the elapsed time as hh:mm:ss.
+                    elapsed = self.format_time(time.time() - t0)
+
+                    # Report progress.
+                    print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+                # Unpack this training batch from our dataloader.
+                #
+                # As we unpack the batch, we'll also copy each tensor to the GPU using the
+                # `to` method.
+                #
+                # `batch` contains three pytorch tensors:
+                #   [0]: input ids
+                #   [1]: attention masks
+                #   [2]: labels
+                b_input_ids = batch[0].to(self.device)
+                b_input_mask = batch[1].to(self.device)
+                b_labels = batch[2].to(self.device)
+
+                # Always clear any previously calculated gradients before performing a
+                # backward pass. PyTorch doesn't do this automatically because
+                # accumulating the gradients is "convenient while training RNNs".
+                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+                self.model.zero_grad()
+
+                # Perform a forward pass (evaluate the model on this training batch).
+                # The documentation for this `model` function is here:
+                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                # It returns different numbers of parameters depending on what arguments
+                # are given and what flags are set. For our usage here, it returns
+                # the loss (because we provided labels) and the "logits"--the model
+                # outputs prior to activation.
+
+                output = self.model(b_input_ids,
+                                    token_type_ids=None,
+                                    attention_mask=b_input_mask,
+                                    labels=b_labels)
+
+
+                loss = output.loss
+                logits = output.logits
+
+                # Accumulate the training loss over all of the batches so that we can
+                # calculate the average loss at the end. `loss` is a Tensor containing a
+                # single value; the `.item()` function just returns the Python value
+                # from the tensor.
+                total_train_loss += loss.item()
+
+                # Perform a backward pass to calculate the gradients.
+                loss.backward()
+
+                # Clip the norm of the gradients to 1.0.
+                # This is to help prevent the "exploding gradients" problem.
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                # Update parameters and take a step using the computed gradient.
+                # The optimizer dictates the "update rule"--how the parameters are
+                # modified based on their gradients, the learning rate, etc.
+                optimizer.step()
+
+                # Update the learning rate.
+                scheduler.step()
+
+            # Calculate the average loss over all of the batches.
+            avg_train_loss = total_train_loss / len(train_dataloader)
+
+            # Measure how long this epoch took.
+            training_time = self.format_time(time.time() - t0)
+
+            print("")
+            print(" Average training loss: {0:.2f}".format(avg_train_loss))
+            print(" Training epoch took: {:}".format(training_time))
+
+        print("")
+        print("Training complete!")
+
+        print("Total training took {:} (h:mm:ss)".format(self.format_time(time.time()-total_t0)))
+
+    def evaluate(self, sentences: list):
+        input_ids = []
+        attention_masks = []
+
+        for sent in sentences:
+            input_id, attention_mask = self.tokenizeText(sent)
+            input_ids.append(input_id)
+            attention_masks.append(attention_mask)
+
+        input_ids = torch.cat(input_ids, dim=0)
+        attention_masks = torch.cat(attention_masks, dim=0)
+        labels = torch.zeros(len(sentences))
+
+        batch_size = 32
+
+        prediction_data = TensorDataset(input_ids, attention_masks, labels)
+        prediction_sampler = SequentialSampler(prediction_data)
+        prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+        self.model.eval()
+
+        predictions = []
+
+        for batch in prediction_dataloader:
+            batch = tuple(t.to(self.device) for t in batch)
+
+            b_input_ids, b_input_mask, _ = batch
+
+            with torch.no_grad():
+                outputs = self.model(b_input_ids, token_type_ids=None,
+                                     attention_mask=b_input_mask)
+
+            logits = outputs[0]
+
+            logits = logits.detach().cpu().numpy()
+            predictions.append(logits)
+
+        # Concatenate all batches so indexing works even for more than 32 sentences.
+        all_logits = np.concatenate(predictions, axis=0)
+        return [int(all_logits[i].argmax()) for i in range(len(sentences))]
+
+
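
For reference, a minimal offline sketch of how the two new modules are meant to be used together, mirroring the per-request path in gradio_app.py. It assumes the jeevavijay10/nlp-goemotions-bert checkpoint is reachable and was fine-tuned with labels in the same order as the keys of emotion_mapping_finalized.json:

# Sketch (not in the commit): map a predicted class index back to a coarse emotion name.
from src.models.bert import BERTClassifier
from src.utils.utilities import Utility

classes = Utility().read_emotion_list()   # the 14 coarse emotion names, in JSON key order
model = BERTClassifier(model_name="jeevavijay10/nlp-goemotions-bert")

# evaluate() returns one argmax class index per input sentence
indices = model.evaluate(["I can't believe this actually worked!"])
print(classes[indices[0]])                # e.g. "surprise"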
src/utils/utilities.py ADDED
@@ -0,0 +1,12 @@
+import json
+
+class Utility:
+
+    def read_emotion_list(self):
+        with open('./emotion_mapping_finalized.json') as emo_mapping_file:
+            finalized_emotions = json.load(emo_mapping_file)
+        emotions_mapping = {}
+        for key, values in finalized_emotions.items():
+            for emotion in values:
+                emotions_mapping[emotion] = key
+        return list(finalized_emotions.keys())