uakarsh committed on
Commit
c017f2e
1 Parent(s): 31dbc87

Add application file

Files changed (6)
  1. app.py +148 -0
  2. dataset.py +150 -0
  3. modeling.py +251 -0
  4. packages.txt +1 -0
  5. requirements.txt +16 -0
  6. utils.py +116 -0
app.py ADDED
@@ -0,0 +1,148 @@
# app.py: Gradio demo for LaTr (Layout Aware Transformer) VQA

from torch import cuda
from transformers import T5Tokenizer, T5ForConditionalGeneration, ViTFeatureExtractor, ViTModel
import gradio as gr
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features
import torch.nn as nn
from PIL import Image, ImageDraw
import pytesseract
from tqdm.auto import tqdm
import numpy as np
import json
import os
import torch
import torchvision
from torchvision import transforms


# Install the PyTesseract Python bindings (the Tesseract binary itself comes from packages.txt)
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Hyperparameters and primary configuration

PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"

device = 'cuda' if cuda.is_available() else 'cpu'


# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)
url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..2HGa6jqeAbugMJYxSkh7eA.XkaLSf8XlITet17Bscupegw9zWLw-IEizSy1lM-_PJF_Gfj-YuinOpDw4ad0M8r-s3WlnclQhHYrd2seaZVjBmkm5WSE6Dae1fW54dnNhyWF5w5O2VafNar7QSuUTSRzacJcmtqI1ypL3OZofwXuETbXq4weeqfDptFS5luxuV0P4Vaer_xEgfsdld6v8O5jjMXwb1CVmPCjMdZUE-HTgzTDiwv3Lb-P3dkRgU7q-iI5GeYZCODYGrX-koxya9DlfzKQZXmJmvtMj45vUZ8OSRB0_hTc7UosQanA-SalWznnOuyOgwl4hMag5toTomriWsxfvJIRBn9CYgFcvUJNqO_kDzBUoAwnagjcxXeEIJTJglwAl9Rs37XyfJAZr7yQ_YTXeRW1j2QMsT_M3qtS96IKRTpsqPVibl8Vrs9Q5g_vKccIQR9t7R9ma_DZLwjWYhDvDO06AZqtdaYGfWaOrbqe8dDvJkZoHsZEO8ukpIH6YNLyCO_dqgRsE77I9jqxiUqQh1KnuNv2hGRSlQR7u8OF7lpiRS7JEwj2MaxlzD58dyhOOLDqrbLp7XWrgV79EQcRYHFSMfhDvG0zmGvHjWGAg-LGhnYIc0NMVhyRv5Pfta9WYEl4qXxCTZWe4olgV79WHLqksQMVyTteheB36n4biHZKx4KZj7k-j3aSI72DIAvj7_UFeHxUTTZ1c6MB.7BF6J5MPMuhQFU48xVZ2qQ/models/epoch=0-step=34602.ckpt'

try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception:
    print("Checkpoint not loaded")

image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")

vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")


def answer_question(image, question):

    # Extracting features from the image
    image.save("sample.png")
    img, boxes, tokenized_words = create_features("sample.png",
                                                  tokenizer=tokenizer,
                                                  target_size=target_size,
                                                  max_seq_length=max_seq_len,
                                                  use_ocr=True
                                                  )

    # Converting the boxes into the [x0, y0, x1, y1, w, h] format required by the model
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], axis=-1)

    # Clamping the values, since some of the box values fall out of bounds
    # (the 2D position embeddings cover the range [0, 1000])
    boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=1000)
    boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=1000)
    boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=1000)
    boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=1000)
    boxes[:, 4] = torch.clamp(boxes[:, 4], min=0, max=1000)
    boxes[:, 5] = torch.clamp(boxes[:, 5], min=0, max=1000)

    # Tokenized words as a tensor
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    # Expanding the dimensions for inference
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if len(img.shape) == 3:
        img = img.unsqueeze(0)

    encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}

    with torch.no_grad():
        logits = latr.forward(encoding)
        logits = logits.squeeze(0)

    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()
    mask = torch.clamp(preds, min=0, max=1)
    last_non_zero_argument = (mask != 0).nonzero()[-1][-1]

    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
    return predicted_ans


# Adapted from: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: LaTr (Layout Aware Transformer) for VQA"
description = "Gradio demo for LaTr (Layout Aware Transformer), trained on the TextVQA dataset. To use it, simply upload an image, type a question and click 'Submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA, a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"
examples = [['remote.png', "Is a remote present in the picture?"]]

interface = gr.Interface(fn=answer_question,
                         inputs=[image, question],
                         outputs=answer,
                         examples=examples,
                         title=title,
                         description=description,
                         article=article,
                         enable_queue=True)
interface.launch(debug=True)
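
A quick way to sanity-check the pipeline outside the Gradio UI is to call answer_question directly. The snippet below is only a sketch: it assumes answer_question and its module-level globals (tokenizer, latr, vit_feat_extract) are already defined in the current session, for example by running app.py up to the interface definition, and that a hypothetical test image remote.png sits in the working directory.

from PIL import Image

# Hypothetical local test image; any RGB image with readable text will do.
test_image = Image.open("remote.png").convert("RGB")
print(answer_question(test_image, "what number is the button near the top left?"))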
dataset.py ADDED
@@ -0,0 +1,150 @@
import os
import json
import numpy as np
import pytesseract
from PIL import Image, ImageDraw

PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512

## Function: 1
## Purpose: Resize and align the bounding box for a differently sized image

def resize_align_bbox(bbox, orig_w, orig_h, target_w, target_h):
    x_scale = target_w / orig_w
    y_scale = target_h / orig_h
    orig_left, orig_top, orig_right, orig_bottom = bbox
    x = int(np.round(orig_left * x_scale))
    y = int(np.round(orig_top * y_scale))
    xmax = int(np.round(orig_right * x_scale))
    ymax = int(np.round(orig_bottom * y_scale))
    return [x, y, xmax, ymax]

## Function: 2
## Purpose: Read a JSON file from the given path and return it as a dictionary

def load_json_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

## Function: 3
## Purpose: Get the path of the first file with a specific extension, e.g. .pdf, .tif, and so on

def get_specific_file(path, last_entry='tif'):
    base_path = path
    for i in os.listdir(path):
        if i.endswith(last_entry):
            return os.path.join(base_path, i)
    return '-1'


## Function: 4
## Purpose: Tokenize the words and repeat each bounding box for every sub-token of its word

def get_tokens_with_boxes(unnormalized_word_boxes, list_of_words, tokenizer, pad_token_id=0, pad_token_box=[0, 0, 0, 0], max_seq_len=512):
    '''
    This function returns two items:
    1. unnormalized_token_boxes -> a list of length max_seq_len containing the box for each token of the tokenized words
       (a box may repeat, depending on how the tokenizer splits a word)
    2. tokenized_words -> the token ids produced by the tokenizer for list_of_words
    '''
    assert len(unnormalized_word_boxes) == len(list_of_words), "Bounding box length != total words length"

    unnormalized_token_boxes = []
    tokenized_words = []

    for box, word in zip(unnormalized_word_boxes, list_of_words):
        current_tokens = tokenizer(word, add_special_tokens=False).input_ids
        unnormalized_token_boxes.extend([box] * len(current_tokens))
        tokenized_words.extend(current_tokens)

    if len(unnormalized_token_boxes) < max_seq_len:
        unnormalized_token_boxes.extend([pad_token_box] * (max_seq_len - len(unnormalized_token_boxes)))

    if len(tokenized_words) < max_seq_len:
        tokenized_words.extend([pad_token_id] * (max_seq_len - len(tokenized_words)))

    return unnormalized_token_boxes[:max_seq_len], tokenized_words[:max_seq_len]

## Function: 5
## Purpose: Helper used only by apply_ocr below

def get_topleft_bottomright_coordinates(df_row):
    left, top, width, height = df_row["left"], df_row["top"], df_row["width"], df_row["height"]
    return [left, top, left + width, top + height]

## Function: 6
## Purpose: If no OCR annotations are provided, extract them with PyTesseract

def apply_ocr(tif_path):
    """
    Returns the words and their bounding boxes from an image
    """
    img = Image.open(tif_path).convert("RGB")

    ocr_df = pytesseract.image_to_data(img, output_type="data.frame")
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    float_cols = ocr_df.select_dtypes("float").columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    ocr_df = ocr_df.replace(r"^\s*$", np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    words = list(ocr_df.text.apply(lambda x: str(x).strip()))
    actual_bboxes = ocr_df.apply(get_topleft_bottomright_coordinates, axis=1).values.tolist()

    assert len(words) == len(actual_bboxes)
    return {"words": words, "bbox": actual_bboxes}


## Function: 7
## Purpose: Combine the functions above to produce the resized image, the token bounding boxes and the token ids

def create_features(
    image_path,
    tokenizer,
    target_size=(1000, 1000),
    max_seq_length=512,
    use_ocr=False,
    bounding_box=None,
    words=None
):
    '''
    We assume the bounding boxes are given at the original image scale (i.e. not normalized),
    so they only need to be rescaled by the resize ratio.
    '''
    img = Image.open(image_path).convert("RGB")
    width_old, height_old = img.size
    img = img.resize(target_size)
    width, height = img.size

    if (not use_ocr) and (bounding_box is None or words is None):
        raise Exception('Please provide the bounding boxes and words, or pass the argument use_ocr=True')

    if use_ocr:
        entries = apply_ocr(image_path)
        bounding_box = entries["bbox"]
        words = entries["words"]

    # Rescaling the bounding boxes to the resized image
    bounding_box = list(map(lambda x: resize_align_bbox(x, width_old, height_old, width, height), bounding_box))
    boxes, tokenized_words = get_tokens_with_boxes(unnormalized_word_boxes=bounding_box,
                                                   list_of_words=words,
                                                   tokenizer=tokenizer,
                                                   pad_token_id=0,
                                                   pad_token_box=PAD_TOKEN_BOX,
                                                   max_seq_len=max_seq_length
                                                   )

    return img, boxes, tokenized_words
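
For reference, a minimal usage sketch for create_features, assuming Tesseract is installed (see packages.txt), a hypothetical local image sample.png exists, and the same t5-base tokenizer as app.py is used:

from transformers import T5Tokenizer
from dataset import create_features

tokenizer = T5Tokenizer.from_pretrained("t5-base")

# OCR path: words and boxes are extracted with PyTesseract, then rescaled to target_size
img, boxes, tokens = create_features("sample.png",
                                     tokenizer=tokenizer,
                                     target_size=(500, 384),
                                     max_seq_length=512,
                                     use_ocr=True)
print(img.size, len(boxes), len(tokens))  # (500, 384), 512, 512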
modeling.py ADDED
@@ -0,0 +1,251 @@
import torch.nn as nn
import torch
from transformers import T5ForConditionalGeneration, ViTModel

import pytorch_lightning as pl

# Defining the PyTorch models


class LaTr_for_pretraining(nn.Module):
    def __init__(self, config, classify=False):

        super(LaTr_for_pretraining, self).__init__()
        self.vocab_size = config['vocab_size']

        model = T5ForConditionalGeneration.from_pretrained(config['t5_model'])
        # Removing the embedding layer from the encoder
        dummy_encoder = list(nn.Sequential(
            *list(model.encoder.children())[1:]).children())
        # Removing the embedding layer from the decoder
        dummy_decoder = list(nn.Sequential(
            *list(model.decoder.children())[1:]).children())

        # Using the T5 encoder and decoder blocks

        self.list_encoder = nn.Sequential(*list(dummy_encoder[0]))
        self.residue_encoder = nn.Sequential(*list(dummy_encoder[1:]))
        self.list_decoder = nn.Sequential(*list(dummy_decoder[0]))
        self.residue_decoder = nn.Sequential(*list(dummy_decoder[1:]))

        # We use the T5 embeddings for encoding the tokenized words
        self.language_emb = nn.Embedding.from_pretrained(model.shared.weight)

        self.top_left_x = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.bottom_right_x = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.top_left_y = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.bottom_right_y = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.width_emb = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.height_emb = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])

        self.classify = classify
        self.classification_layer = nn.Linear(
            config['hidden_state'], config['classes'])

    def forward(self, tokens, coordinates, predict_proba=False, predict_class=False):

        batch_size = len(tokens)
        embeded_feature = self.language_emb(tokens)

        top_left_x_feat = self.top_left_x(coordinates[:, :, 0])
        top_left_y_feat = self.top_left_y(coordinates[:, :, 1])
        bottom_right_x_feat = self.bottom_right_x(coordinates[:, :, 2])
        bottom_right_y_feat = self.bottom_right_y(coordinates[:, :, 3])
        width_feat = self.width_emb(coordinates[:, :, 4])
        height_feat = self.height_emb(coordinates[:, :, 5])

        total_feat = embeded_feature + top_left_x_feat + top_left_y_feat + \
            bottom_right_x_feat + bottom_right_y_feat + width_feat + height_feat

        # Extracting the features

        for layer in self.list_encoder:
            total_feat = layer(total_feat)[0]
        total_feat = self.residue_encoder(total_feat)

        for layer in self.list_decoder:
            total_feat = layer(total_feat)[0]
        total_feat = self.residue_decoder(total_feat)

        if self.classify:
            total_feat = self.classification_layer(total_feat)

        if predict_proba:
            return total_feat.softmax(axis=-1)

        if predict_class:
            return total_feat.argmax(axis=-1)

        return total_feat


class LaTr_for_finetuning(nn.Module):
    def __init__(self, config, address_to_pre_trained_weights=None):
        super(LaTr_for_finetuning, self).__init__()

        self.config = config
        self.vocab_size = config['vocab_size']

        self.pre_training_model = LaTr_for_pretraining(config)
        if address_to_pre_trained_weights is not None:
            self.pre_training_model.load_state_dict(
                torch.load(address_to_pre_trained_weights))
        self.vit = ViTModel.from_pretrained(
            "google/vit-base-patch16-224-in21k")

        # In the fine-tuning stage of ViT, all layers except the last were frozen

        self.classification_head = nn.Linear(
            config['hidden_state'], config['classes'])

    def forward(self, lang_vect, spatial_vect, quest_vect, img_vect):

        # The block below computes the language and spatial features
        embeded_feature = self.pre_training_model.language_emb(lang_vect)
        top_left_x_feat = self.pre_training_model.top_left_x(
            spatial_vect[:, :, 0])
        top_left_y_feat = self.pre_training_model.top_left_y(
            spatial_vect[:, :, 1])
        bottom_right_x_feat = self.pre_training_model.bottom_right_x(
            spatial_vect[:, :, 2])
        bottom_right_y_feat = self.pre_training_model.bottom_right_y(
            spatial_vect[:, :, 3])
        width_feat = self.pre_training_model.width_emb(spatial_vect[:, :, 4])
        height_feat = self.pre_training_model.height_emb(spatial_vect[:, :, 5])

        spatial_lang_feat = embeded_feature + top_left_x_feat + top_left_y_feat + \
            bottom_right_x_feat + bottom_right_y_feat + width_feat + height_feat

        # Extracting the image features with the Vision Transformer
        img_feat = self.vit(img_vect).last_hidden_state

        # Embedding the question tokens
        quest_feat = self.pre_training_model.language_emb(quest_vect)

        # Concatenating the three features before passing them through the T5 blocks
        final_feat = torch.cat(
            [img_feat, spatial_lang_feat, quest_feat], axis=-2)

        # Passing through the T5 encoder and decoder blocks
        for layer in self.pre_training_model.list_encoder:
            final_feat = layer(final_feat)[0]

        final_feat = self.pre_training_model.residue_encoder(final_feat)

        for layer in self.pre_training_model.list_decoder:
            final_feat = layer(final_feat)[0]
        final_feat = self.pre_training_model.residue_decoder(final_feat)

        answer_vector = self.classification_head(
            final_feat)[:, :self.config['seq_len'], :]

        return answer_vector


def polynomial(base_lr, iter, max_iter=1e5, power=1):
    return base_lr * ((1 - float(iter) / max_iter) ** power)


class LaTrForVQA(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000 // 2):
        super(LaTrForVQA, self).__init__()

        self.config = config
        self.save_hyperparameters()
        self.latr = LaTr_for_finetuning(config)
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        boxes = batch_dict['boxes']
        img = batch_dict['img']
        question = batch_dict['question']
        words = batch_dict['tokenized_words']
        answer_vector = self.latr(lang_vect=words,
                                  spatial_vect=boxes,
                                  img_vect=img,
                                  quest_vect=question
                                  )
        return answer_vector

    def calculate_metrics(self, prediction, labels):

        # Calculate the accuracy score between the predictions and the ground-truth labels
        # for a batch, taking the pad tokens into account.
        # calculate_acc_score is expected to be provided by the training utilities; it is not defined in this file.
        batch_size = len(prediction)
        ac_score = 0

        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(),
                                            gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)

        # https://discuss.huggingface.co/t/bertformaskedlm-s-loss-and-scores-how-the-loss-is-computed/607/2
        loss = nn.CrossEntropyLoss(ignore_index=0)(
            answer_vector.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)

        # Calculating the accuracy score
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)

        # Logging
        self.log('train_ce_loss', loss, prog_bar=True)
        self.log('train_acc', train_acc, prog_bar=True)
        self.training_losses.append(loss.item())

        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss(ignore_index=0)(
            logits.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)

        # Validation accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)

        # Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        self.validation_losses.append(loss.item())
        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None, on_tpu=False,
                       using_native_amp=False, using_lbfgs=False):

        # Warm up for 1000 steps
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate

        # Linear decay afterwards
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(
                    self.hparams.learning_rate, self.trainer.global_step, max_iter=self.max_steps)

        optimizer.step(opt_closure)
        optimizer.zero_grad()

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        self.log('val_loss_epoch_end', val_loss, on_epoch=True, sync_dist=True)
        self.log('val_acc_epoch_end', val_acc, on_epoch=True, sync_dist=True)
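
The batch layout that LaTrForVQA.forward expects can be illustrated with dummy tensors. This is a sketch only; the shapes and dtypes below are assumptions taken from how app.py builds its encoding (boxes hold [x0, y0, x1, y1, w, h] per token, clamped to the 0-1000 range covered by the 2D position embeddings), and instantiating the model downloads the t5-base and ViT weights.

import torch
from modeling import LaTrForVQA

config = {'t5_model': 't5-base', 'vocab_size': 32128, 'hidden_state': 768,
          'max_2d_position_embeddings': 1001, 'classes': 32128, 'seq_len': 512}

model = LaTrForVQA(config)
model.eval()

batch = {
    'img': torch.rand(1, 3, 224, 224),                     # ViT pixel values
    'boxes': torch.randint(0, 1000, (1, 512, 6)),          # per-token spatial features
    'tokenized_words': torch.randint(0, 32128, (1, 512)),  # OCR token ids
    'question': torch.randint(0, 32128, (1, 512)),         # question token ids
}

with torch.no_grad():
    logits = model(batch)
print(logits.shape)  # torch.Size([1, 512, 32128]): one vocabulary distribution per answer position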
packages.txt ADDED
@@ -0,0 +1 @@
tesseract-ocr
requirements.txt ADDED
@@ -0,0 +1,16 @@
transformers
sentencepiece==0.1.91
pytesseract
Pillow==7.1.2
pytorch-lightning
gradio
torchvision
scikit-learn
pandas
matplotlib
seaborn
numpy
torch
einops
tqdm
utils.py ADDED
@@ -0,0 +1,116 @@
# import random
import torch
import math
from torch.nn.utils.rnn import pad_sequence


def find_pad_idx(boxes):
    for i, box in enumerate(boxes):
        if int(box.sum().item()) == 0:
            return i
    return i


# def apply_mask_on_token_bbox(boxes, tokenized_words, only_actual_words=False, span=4, proportion_to_mask=0.15, special_token=103):
#
#     '''
#     Code taken from here: https://www.geeksforgeeks.org/python-non-overlapping-random-ranges/
#
#     Note: a more robust solution is still to be coded
#     '''
#     length_to_be_masked = int(proportion_to_mask * len(boxes))
#
#     if only_actual_words:
#         tot = find_pad_idx(tokenized_words)
#     else:
#         tot = len(boxes)
#
#     res = set()
#     for _ in range(length_to_be_masked):
#         temp = random.randint(0, tot - span)
#         while any(((temp >= idx) and (temp <= idx + span)) for idx in res):
#             temp = random.randint(0, tot - span)
#         res.add(temp)
#
#         ## Applying the mask on the token
#         tokenized_words[temp] = special_token
#
#         ## Applying the mask on the box
#         boxes[temp, 0] = torch.min(boxes[temp: temp + span, 0])
#         boxes[temp, 1] = torch.min(boxes[temp: temp + span, 1])
#         boxes[temp, 2] = torch.max(boxes[temp: temp + span, 2])
#         boxes[temp, 3] = torch.max(boxes[temp: temp + span, 3])
#         boxes[temp, 4] = boxes[temp, 2] - boxes[temp, 0]
#         boxes[temp, 5] = boxes[temp, 3] - boxes[temp, 1]
#
#     return res, boxes, tokenized_words


def convert_ans_to_token(answer, label2id, max_seq_length=512):

    # Simple trick to pad a sequence to the desired length
    dummy_array = torch.zeros(max_seq_length)
    actual_ans_array = []

    answer = answer.split(" ")
    for token in answer:
        actual_ans_array.append(label2id[token]['id'])

    actual_ans_array = torch.tensor(actual_ans_array, dtype=torch.int32)
    actual_ans_array = pad_sequence([actual_ans_array, dummy_array], batch_first=True)[0]

    return actual_ans_array


def convert_ques_to_token(question, tokenizer, pad_token_id=0, max_seq_len=512):

    question_array = []
    question = question.split(" ")

    for token in question:
        question_array.extend(tokenizer(token, add_special_tokens=False).input_ids)

    if len(question_array) < max_seq_len:
        question_array.extend([pad_token_id] * (max_seq_len - len(question_array)))

    question_array = torch.tensor(question_array, dtype=torch.int32)
    return question_array[:max_seq_len]


## Taken from:
## https://logicatcore.github.io/scratchpad/lidar/sensor-fusion/jupyter/2021/04/20/3D-Oriented-Bounding-Box.html

def rotate(origin, point, angle):
    """
    Rotate a point counterclockwise by a given angle around a given origin.
    The angle should be given in radians.

    Modified from the answer here: https://stackoverflow.com/questions/34372480/rotate-point-about-another-point-in-degrees-python
    """
    # angle = np.deg2rad(angle)
    ox, oy = origin
    px, py = point

    qx = ox + math.cos(angle) * (px - ox) - math.sin(angle) * (py - oy)
    qy = oy + math.sin(angle) * (px - ox) + math.cos(angle) * (py - oy)
    return int(qx), int(qy)


def convert_token_to_ques(ques, tokenizer):
    decoded_ques = tokenizer.decode(ques, skip_special_tokens=True)
    return decoded_ques


def convert_token_to_answer(ans, id2label):
    non_zero_argument = torch.nonzero(ans, as_tuple=False).view(-1)

    actual_answer = ans[non_zero_argument].cpu().numpy()
    decoded_answer = []

    for token in actual_answer:
        decoded_answer.append(id2label[token])

    decoded_answer = " ".join(decoded_answer)
    return decoded_answer
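
A small round-trip sketch for the question helpers, again assuming the t5-base tokenizer used elsewhere in this Space:

from transformers import T5Tokenizer
from utils import convert_ques_to_token, convert_token_to_ques

tokenizer = T5Tokenizer.from_pretrained("t5-base")

ids = convert_ques_to_token("what number is the button?", tokenizer)
print(ids.shape)  # torch.Size([512]) -- token ids padded with 0 (the T5 pad id)
print(convert_token_to_ques(ids, tokenizer))  # should recover the question; pad ids are skipped when decoding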