uakarsh committed on
Commit
c017f2e
1 Parent(s): 31dbc87

Add application file

Files changed (6)
  1. app.py +148 -0
  2. dataset.py +150 -0
  3. modeling.py +251 -0
  4. packages.txt +1 -0
  5. requirements.txt +16 -0
  6. utils.py +116 -0
app.py ADDED
@@ -0,0 +1,148 @@
# app.py: Gradio demo for LaTr (Layout Aware Transformer) VQA

from torch import cuda
from transformers import T5Tokenizer, T5ForConditionalGeneration, ViTFeatureExtractor, ViTModel
import gradio as gr
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features
import torch.nn as nn
from PIL import Image, ImageDraw
import pytesseract
from tqdm.auto import tqdm
import numpy as np
import json
import os
import torch
import torchvision
from torchvision import transforms


# Install the PyTesseract Python bindings (the Tesseract binary itself comes from packages.txt)
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Hyperparameters and primary configuration

PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"

device = 'cuda' if cuda.is_available() else 'cpu'


# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)
url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..2HGa6jqeAbugMJYxSkh7eA.XkaLSf8XlITet17Bscupegw9zWLw-IEizSy1lM-_PJF_Gfj-YuinOpDw4ad0M8r-s3WlnclQhHYrd2seaZVjBmkm5WSE6Dae1fW54dnNhyWF5w5O2VafNar7QSuUTSRzacJcmtqI1ypL3OZofwXuETbXq4weeqfDptFS5luxuV0P4Vaer_xEgfsdld6v8O5jjMXwb1CVmPCjMdZUE-HTgzTDiwv3Lb-P3dkRgU7q-iI5GeYZCODYGrX-koxya9DlfzKQZXmJmvtMj45vUZ8OSRB0_hTc7UosQanA-SalWznnOuyOgwl4hMag5toTomriWsxfvJIRBn9CYgFcvUJNqO_kDzBUoAwnagjcxXeEIJTJglwAl9Rs37XyfJAZr7yQ_YTXeRW1j2QMsT_M3qtS96IKRTpsqPVibl8Vrs9Q5g_vKccIQR9t7R9ma_DZLwjWYhDvDO06AZqtdaYGfWaOrbqe8dDvJkZoHsZEO8ukpIH6YNLyCO_dqgRsE77I9jqxiUqQh1KnuNv2hGRSlQR7u8OF7lpiRS7JEwj2MaxlzD58dyhOOLDqrbLp7XWrgV79EQcRYHFSMfhDvG0zmGvHjWGAg-LGhnYIc0NMVhyRv5Pfta9WYEl4qXxCTZWe4olgV79WHLqksQMVyTteheB36n4biHZKx4KZj7k-j3aSI72DIAvj7_UFeHxUTTZ1c6MB.7BF6J5MPMuhQFU48xVZ2qQ/models/epoch=0-step=34602.ckpt'

try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception:
    print("Checkpoint not loaded")

image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")

vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")


def answer_question(image, question):

    # Extracting features from the image
    image.save("sample.png")
    img, boxes, tokenized_words = create_features("sample.png",
                                                  tokenizer=tokenizer,
                                                  target_size=target_size,
                                                  max_seq_length=max_seq_len,
                                                  use_ocr=True
                                                  )

    # Converting the boxes into the [x0, y0, x1, y1, w, h] format required by the model
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], axis=-1)

    # Clamping the values, since some of the box values fall out of bounds
    # (the 2D position embeddings cover the range [0, 1000])
    boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=1000)
    boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=1000)
    boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=1000)
    boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=1000)
    boxes[:, 4] = torch.clamp(boxes[:, 4], min=0, max=1000)
    boxes[:, 5] = torch.clamp(boxes[:, 5], min=0, max=1000)

    # Tokenized words as a tensor
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    # Expanding the dimensions for inference
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if len(img.shape) == 3:
        img = img.unsqueeze(0)

    encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}

    with torch.no_grad():
        logits = latr.forward(encoding)
        logits = logits.squeeze(0)

    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()
    mask = torch.clamp(preds, min=0, max=1)
    last_non_zero_argument = (mask != 0).nonzero()[-1][-1]

    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
    return predicted_ans


# Adapted from: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: LaTr (Layout Aware Transformer) for VQA"
description = "Gradio demo for LaTr (Layout Aware Transformer), trained on the TextVQA dataset. To use it, simply upload an image, type a question and click 'Submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA, a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"
examples = [['remote.png', "Is a remote present in the picture?"]]

interface = gr.Interface(fn=answer_question,
                         inputs=[image, question],
                         outputs=answer,
                         examples=examples,
                         title=title,
                         description=description,
                         article=article,
                         enable_queue=True)
interface.launch(debug=True)
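
A quick way to sanity-check the pipeline outside the Gradio UI is to call answer_question directly. The snippet below is only a sketch: it assumes answer_question and its module-level globals (tokenizer, latr, vit_feat_extract) are already defined in the current session, for example by running app.py up to the interface definition, and that a hypothetical test image remote.png sits in the working directory.

from PIL import Image

# Hypothetical local test image; any RGB image with readable text will do.
test_image = Image.open("remote.png").convert("RGB")
print(answer_question(test_image, "what number is the button near the top left?"))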
dataset.py ADDED
@@ -0,0 +1,150 @@
import os
import json
import numpy as np
import pytesseract
from PIL import Image, ImageDraw

PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512

## Function: 1
## Purpose: Resize and align the bounding box for a differently sized image

def resize_align_bbox(bbox, orig_w, orig_h, target_w, target_h):
    x_scale = target_w / orig_w
    y_scale = target_h / orig_h
    orig_left, orig_top, orig_right, orig_bottom = bbox
    x = int(np.round(orig_left * x_scale))
    y = int(np.round(orig_top * y_scale))
    xmax = int(np.round(orig_right * x_scale))
    ymax = int(np.round(orig_bottom * y_scale))
    return [x, y, xmax, ymax]

## Function: 2
## Purpose: Read a JSON file from the given path and return it as a dictionary

def load_json_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

## Function: 3
## Purpose: Get the path of the first file with a specific extension, e.g. .pdf, .tif, and so on

def get_specific_file(path, last_entry='tif'):
    base_path = path
    for i in os.listdir(path):
        if i.endswith(last_entry):
            return os.path.join(base_path, i)
    return '-1'


## Function: 4
## Purpose: Tokenize the words and repeat each bounding box for every sub-token of its word

def get_tokens_with_boxes(unnormalized_word_boxes, list_of_words, tokenizer, pad_token_id=0, pad_token_box=[0, 0, 0, 0], max_seq_len=512):
    '''
    This function returns two items:
    1. unnormalized_token_boxes -> a list of length max_seq_len containing the box for each token of the tokenized words
       (a box may repeat, depending on how the tokenizer splits a word)
    2. tokenized_words -> the token ids produced by the tokenizer for list_of_words
    '''
    assert len(unnormalized_word_boxes) == len(list_of_words), "Bounding box length != total words length"

    unnormalized_token_boxes = []
    tokenized_words = []

    for box, word in zip(unnormalized_word_boxes, list_of_words):
        current_tokens = tokenizer(word, add_special_tokens=False).input_ids
        unnormalized_token_boxes.extend([box] * len(current_tokens))
        tokenized_words.extend(current_tokens)

    if len(unnormalized_token_boxes) < max_seq_len:
        unnormalized_token_boxes.extend([pad_token_box] * (max_seq_len - len(unnormalized_token_boxes)))

    if len(tokenized_words) < max_seq_len:
        tokenized_words.extend([pad_token_id] * (max_seq_len - len(tokenized_words)))

    return unnormalized_token_boxes[:max_seq_len], tokenized_words[:max_seq_len]

## Function: 5
## Purpose: Helper used only by apply_ocr below

def get_topleft_bottomright_coordinates(df_row):
    left, top, width, height = df_row["left"], df_row["top"], df_row["width"], df_row["height"]
    return [left, top, left + width, top + height]

## Function: 6
## Purpose: If no OCR annotations are provided, extract them with PyTesseract

def apply_ocr(tif_path):
    """
    Returns the words and their bounding boxes from an image
    """
    img = Image.open(tif_path).convert("RGB")

    ocr_df = pytesseract.image_to_data(img, output_type="data.frame")
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    float_cols = ocr_df.select_dtypes("float").columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    ocr_df = ocr_df.replace(r"^\s*$", np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    words = list(ocr_df.text.apply(lambda x: str(x).strip()))
    actual_bboxes = ocr_df.apply(get_topleft_bottomright_coordinates, axis=1).values.tolist()

    assert len(words) == len(actual_bboxes)
    return {"words": words, "bbox": actual_bboxes}


## Function: 7
## Purpose: Combine the functions above to produce the resized image, the token bounding boxes and the token ids

def create_features(
    image_path,
    tokenizer,
    target_size=(1000, 1000),
    max_seq_length=512,
    use_ocr=False,
    bounding_box=None,
    words=None
):
    '''
    We assume the bounding boxes are given at the original image scale (i.e. not normalized),
    so they only need to be rescaled by the resize ratio.
    '''
    img = Image.open(image_path).convert("RGB")
    width_old, height_old = img.size
    img = img.resize(target_size)
    width, height = img.size

    if (not use_ocr) and (bounding_box is None or words is None):
        raise Exception('Please provide the bounding boxes and words, or pass the argument use_ocr=True')

    if use_ocr:
        entries = apply_ocr(image_path)
        bounding_box = entries["bbox"]
        words = entries["words"]

    # Rescaling the bounding boxes to the resized image
    bounding_box = list(map(lambda x: resize_align_bbox(x, width_old, height_old, width, height), bounding_box))
    boxes, tokenized_words = get_tokens_with_boxes(unnormalized_word_boxes=bounding_box,
                                                   list_of_words=words,
                                                   tokenizer=tokenizer,
                                                   pad_token_id=0,
                                                   pad_token_box=PAD_TOKEN_BOX,
                                                   max_seq_len=max_seq_length
                                                   )

    return img, boxes, tokenized_words
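
For reference, a minimal usage sketch for create_features, assuming Tesseract is installed (see packages.txt), a hypothetical local image sample.png exists, and the same t5-base tokenizer as app.py is used:

from transformers import T5Tokenizer
from dataset import create_features

tokenizer = T5Tokenizer.from_pretrained("t5-base")

# OCR path: words and boxes are extracted with PyTesseract, then rescaled to target_size
img, boxes, tokens = create_features("sample.png",
                                     tokenizer=tokenizer,
                                     target_size=(500, 384),
                                     max_seq_length=512,
                                     use_ocr=True)
print(img.size, len(boxes), len(tokens))  # (500, 384), 512, 512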
modeling.py ADDED
@@ -0,0 +1,251 @@
import torch.nn as nn
import torch
from transformers import T5ForConditionalGeneration, ViTModel

import pytorch_lightning as pl

# Defining the PyTorch models


class LaTr_for_pretraining(nn.Module):
    def __init__(self, config, classify=False):

        super(LaTr_for_pretraining, self).__init__()
        self.vocab_size = config['vocab_size']

        model = T5ForConditionalGeneration.from_pretrained(config['t5_model'])
        # Removing the embedding layer from the encoder
        dummy_encoder = list(nn.Sequential(
            *list(model.encoder.children())[1:]).children())
        # Removing the embedding layer from the decoder
        dummy_decoder = list(nn.Sequential(
            *list(model.decoder.children())[1:]).children())

        # Using the T5 encoder and decoder blocks

        self.list_encoder = nn.Sequential(*list(dummy_encoder[0]))
        self.residue_encoder = nn.Sequential(*list(dummy_encoder[1:]))
        self.list_decoder = nn.Sequential(*list(dummy_decoder[0]))
        self.residue_decoder = nn.Sequential(*list(dummy_decoder[1:]))

        # We use the T5 embeddings for encoding the tokenized words
        self.language_emb = nn.Embedding.from_pretrained(model.shared.weight)

        self.top_left_x = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.bottom_right_x = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.top_left_y = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.bottom_right_y = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.width_emb = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.height_emb = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])

        self.classify = classify
        self.classification_layer = nn.Linear(
            config['hidden_state'], config['classes'])

    def forward(self, tokens, coordinates, predict_proba=False, predict_class=False):

        batch_size = len(tokens)
        embeded_feature = self.language_emb(tokens)

        top_left_x_feat = self.top_left_x(coordinates[:, :, 0])
        top_left_y_feat = self.top_left_y(coordinates[:, :, 1])
        bottom_right_x_feat = self.bottom_right_x(coordinates[:, :, 2])
        bottom_right_y_feat = self.bottom_right_y(coordinates[:, :, 3])
        width_feat = self.width_emb(coordinates[:, :, 4])
        height_feat = self.height_emb(coordinates[:, :, 5])

        total_feat = embeded_feature + top_left_x_feat + top_left_y_feat + \
            bottom_right_x_feat + bottom_right_y_feat + width_feat + height_feat

        # Extracting the features

        for layer in self.list_encoder:
            total_feat = layer(total_feat)[0]
        total_feat = self.residue_encoder(total_feat)

        for layer in self.list_decoder:
            total_feat = layer(total_feat)[0]
        total_feat = self.residue_decoder(total_feat)

        if self.classify:
            total_feat = self.classification_layer(total_feat)

        if predict_proba:
            return total_feat.softmax(axis=-1)

        if predict_class:
            return total_feat.argmax(axis=-1)

        return total_feat


class LaTr_for_finetuning(nn.Module):
    def __init__(self, config, address_to_pre_trained_weights=None):
        super(LaTr_for_finetuning, self).__init__()

        self.config = config
        self.vocab_size = config['vocab_size']

        self.pre_training_model = LaTr_for_pretraining(config)
        if address_to_pre_trained_weights is not None:
            self.pre_training_model.load_state_dict(
                torch.load(address_to_pre_trained_weights))
        self.vit = ViTModel.from_pretrained(
            "google/vit-base-patch16-224-in21k")

        # In the fine-tuning stage of ViT, all layers except the last were frozen

        self.classification_head = nn.Linear(
            config['hidden_state'], config['classes'])

    def forward(self, lang_vect, spatial_vect, quest_vect, img_vect):

        # The block below computes the language and spatial features
        embeded_feature = self.pre_training_model.language_emb(lang_vect)
        top_left_x_feat = self.pre_training_model.top_left_x(
            spatial_vect[:, :, 0])
        top_left_y_feat = self.pre_training_model.top_left_y(
            spatial_vect[:, :, 1])
        bottom_right_x_feat = self.pre_training_model.bottom_right_x(
            spatial_vect[:, :, 2])
        bottom_right_y_feat = self.pre_training_model.bottom_right_y(
            spatial_vect[:, :, 3])
        width_feat = self.pre_training_model.width_emb(spatial_vect[:, :, 4])
        height_feat = self.pre_training_model.height_emb(spatial_vect[:, :, 5])

        spatial_lang_feat = embeded_feature + top_left_x_feat + top_left_y_feat + \
            bottom_right_x_feat + bottom_right_y_feat + width_feat + height_feat

        # Extracting the image features with the Vision Transformer
        img_feat = self.vit(img_vect).last_hidden_state

        # Embedding the question tokens
        quest_feat = self.pre_training_model.language_emb(quest_vect)

        # Concatenating the three features before passing them through the T5 blocks
        final_feat = torch.cat(
            [img_feat, spatial_lang_feat, quest_feat], axis=-2)

        # Passing through the T5 encoder and decoder blocks
        for layer in self.pre_training_model.list_encoder:
            final_feat = layer(final_feat)[0]

        final_feat = self.pre_training_model.residue_encoder(final_feat)

        for layer in self.pre_training_model.list_decoder:
            final_feat = layer(final_feat)[0]
        final_feat = self.pre_training_model.residue_decoder(final_feat)

        answer_vector = self.classification_head(
            final_feat)[:, :self.config['seq_len'], :]

        return answer_vector


def polynomial(base_lr, iter, max_iter=1e5, power=1):
    return base_lr * ((1 - float(iter) / max_iter) ** power)


class LaTrForVQA(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000 // 2):
        super(LaTrForVQA, self).__init__()

        self.config = config
        self.save_hyperparameters()
        self.latr = LaTr_for_finetuning(config)
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        boxes = batch_dict['boxes']
        img = batch_dict['img']
        question = batch_dict['question']
        words = batch_dict['tokenized_words']
        answer_vector = self.latr(lang_vect=words,
                                  spatial_vect=boxes,
                                  img_vect=img,
                                  quest_vect=question
                                  )
        return answer_vector

    def calculate_metrics(self, prediction, labels):

        # Calculate the accuracy score between the predictions and the ground-truth labels
        # for a batch, taking the pad tokens into account.
        # calculate_acc_score is expected to be provided by the training utilities; it is not defined in this file.
        batch_size = len(prediction)
        ac_score = 0

        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(),
                                            gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)

        # https://discuss.huggingface.co/t/bertformaskedlm-s-loss-and-scores-how-the-loss-is-computed/607/2
        loss = nn.CrossEntropyLoss(ignore_index=0)(
            answer_vector.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)

        # Calculating the accuracy score
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)

        # Logging
        self.log('train_ce_loss', loss, prog_bar=True)
        self.log('train_acc', train_acc, prog_bar=True)
        self.training_losses.append(loss.item())

        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss(ignore_index=0)(
            logits.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)

        # Validation accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)

        # Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        self.validation_losses.append(loss.item())
        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None, on_tpu=False,
                       using_native_amp=False, using_lbfgs=False):

        # Warm up for 1000 steps
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate

        # Linear decay afterwards
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(
                    self.hparams.learning_rate, self.trainer.global_step, max_iter=self.max_steps)

        optimizer.step(opt_closure)
        optimizer.zero_grad()

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        self.log('val_loss_epoch_end', val_loss, on_epoch=True, sync_dist=True)
        self.log('val_acc_epoch_end', val_acc, on_epoch=True, sync_dist=True)
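
The batch layout that LaTrForVQA.forward expects can be illustrated with dummy tensors. This is a sketch only; the shapes and dtypes below are assumptions taken from how app.py builds its encoding (boxes hold [x0, y0, x1, y1, w, h] per token, clamped to the 0-1000 range covered by the 2D position embeddings), and instantiating the model downloads the t5-base and ViT weights.

import torch
from modeling import LaTrForVQA

config = {'t5_model': 't5-base', 'vocab_size': 32128, 'hidden_state': 768,
          'max_2d_position_embeddings': 1001, 'classes': 32128, 'seq_len': 512}

model = LaTrForVQA(config)
model.eval()

batch = {
    'img': torch.rand(1, 3, 224, 224),                     # ViT pixel values
    'boxes': torch.randint(0, 1000, (1, 512, 6)),          # per-token spatial features
    'tokenized_words': torch.randint(0, 32128, (1, 512)),  # OCR token ids
    'question': torch.randint(0, 32128, (1, 512)),         # question token ids
}

with torch.no_grad():
    logits = model(batch)
print(logits.shape)  # torch.Size([1, 512, 32128]): one vocabulary distribution per answer position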
packages.txt ADDED
@@ -0,0 +1 @@
tesseract-ocr
requirements.txt ADDED
@@ -0,0 +1,16 @@
transformers
sentencepiece==0.1.91
pytesseract
Pillow==7.1.2
pytorch-lightning
gradio
torchvision
scikit-learn
pandas
matplotlib
seaborn
numpy
torch
einops
tqdm
utils.py ADDED
@@ -0,0 +1,116 @@
# import random
import torch
import math
from torch.nn.utils.rnn import pad_sequence


def find_pad_idx(boxes):
    for i, box in enumerate(boxes):
        if int(box.sum().item()) == 0:
            return i
    return i


# def apply_mask_on_token_bbox(boxes, tokenized_words, only_actual_words=False, span=4, proportion_to_mask=0.15, special_token=103):
#
#     '''
#     Code taken from here: https://www.geeksforgeeks.org/python-non-overlapping-random-ranges/
#
#     Note: a more robust solution is still to be coded
#     '''
#     length_to_be_masked = int(proportion_to_mask * len(boxes))
#
#     if only_actual_words:
#         tot = find_pad_idx(tokenized_words)
#     else:
#         tot = len(boxes)
#
#     res = set()
#     for _ in range(length_to_be_masked):
#         temp = random.randint(0, tot - span)
#         while any(((temp >= idx) and (temp <= idx + span)) for idx in res):
#             temp = random.randint(0, tot - span)
#         res.add(temp)
#
#         ## Applying the mask on the token
#         tokenized_words[temp] = special_token
#
#         ## Applying the mask on the box
#         boxes[temp, 0] = torch.min(boxes[temp: temp + span, 0])
#         boxes[temp, 1] = torch.min(boxes[temp: temp + span, 1])
#         boxes[temp, 2] = torch.max(boxes[temp: temp + span, 2])
#         boxes[temp, 3] = torch.max(boxes[temp: temp + span, 3])
#         boxes[temp, 4] = boxes[temp, 2] - boxes[temp, 0]
#         boxes[temp, 5] = boxes[temp, 3] - boxes[temp, 1]
#
#     return res, boxes, tokenized_words


def convert_ans_to_token(answer, label2id, max_seq_length=512):

    # Simple trick to pad a sequence to the desired length
    dummy_array = torch.zeros(max_seq_length)
    actual_ans_array = []

    answer = answer.split(" ")
    for token in answer:
        actual_ans_array.append(label2id[token]['id'])

    actual_ans_array = torch.tensor(actual_ans_array, dtype=torch.int32)
    actual_ans_array = pad_sequence([actual_ans_array, dummy_array], batch_first=True)[0]

    return actual_ans_array


def convert_ques_to_token(question, tokenizer, pad_token_id=0, max_seq_len=512):

    question_array = []
    question = question.split(" ")

    for token in question:
        question_array.extend(tokenizer(token, add_special_tokens=False).input_ids)

    if len(question_array) < max_seq_len:
        question_array.extend([pad_token_id] * (max_seq_len - len(question_array)))

    question_array = torch.tensor(question_array, dtype=torch.int32)
    return question_array[:max_seq_len]


## Taken from:
## https://logicatcore.github.io/scratchpad/lidar/sensor-fusion/jupyter/2021/04/20/3D-Oriented-Bounding-Box.html

def rotate(origin, point, angle):
    """
    Rotate a point counterclockwise by a given angle around a given origin.
    The angle should be given in radians.

    Modified from the answer here: https://stackoverflow.com/questions/34372480/rotate-point-about-another-point-in-degrees-python
    """
    # angle = np.deg2rad(angle)
    ox, oy = origin
    px, py = point

    qx = ox + math.cos(angle) * (px - ox) - math.sin(angle) * (py - oy)
    qy = oy + math.sin(angle) * (px - ox) + math.cos(angle) * (py - oy)
    return int(qx), int(qy)


def convert_token_to_ques(ques, tokenizer):
    decoded_ques = tokenizer.decode(ques, skip_special_tokens=True)
    return decoded_ques


def convert_token_to_answer(ans, id2label):
    non_zero_argument = torch.nonzero(ans, as_tuple=False).view(-1)

    actual_answer = ans[non_zero_argument].cpu().numpy()
    decoded_answer = []

    for token in actual_answer:
        decoded_answer.append(id2label[token])

    decoded_answer = " ".join(decoded_answer)
    return decoded_answer
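
A small round-trip sketch for the question helpers, again assuming the t5-base tokenizer used elsewhere in this Space:

from transformers import T5Tokenizer
from utils import convert_ques_to_token, convert_token_to_ques

tokenizer = T5Tokenizer.from_pretrained("t5-base")

ids = convert_ques_to_token("what number is the button?", tokenizer)
print(ids.shape)  # torch.Size([512]) -- token ids padded with 0 (the T5 pad id)
print(convert_token_to_ques(ids, tokenizer))  # should recover the question; pad ids are skipped when decoding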