# Requirements.txt (a package list is sketched at the end of this file)

# Install PyTesseract at runtime; it must be available before `import pytesseract` below
import os
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Default Library import
import json
import numpy as np
from tqdm.auto import tqdm

# Visualization libraries
from PIL import Image, ImageDraw

import pytesseract
import torch
import torch.nn as nn
from torch import cuda
import torchvision
from torchvision import transforms
from transformers import T5Tokenizer, T5ForConditionalGeneration
import gradio as gr

# Specific libraries of LaTr
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features

# Setting the hyperparameters as well as primary configurations
PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"
device = 'cuda' if cuda.is_available() else 'cpu'

# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)

url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..5-IY5sqV-Y5lb7On3LOjMg._mvffzQwAyb-JSgwqhyxcjz3clhuAIwZEep4DA0CEao2LVjijahLYK9Co6yYVbdaEVk8CVqIGCx-_08XSdcsYnkt4HzCxI6zCI6Rv9_PhHITzTCZPC4juNgsmbb3ebu2eu5kJxUGsQvikk6efkpNoXFhPS5XV-Pqx_9wfxDyRJCJ1hzSxtiZcnsobKfoQt6F2w09NWGT45ePd_UlQNloogUD6icJDSWvyLvXHaVryKPGhy3q0_yaVheoBqflipUcUb1Q7q8wRDYbA3Kg_pAJzuyfPGhEp1WUEVt9gMXO1IIUCQbiygZRdGpKZBJwDx2LylLD3NwKMqv_maUknV0pCRhES45pFpuXv0X8ITGcr8DtGeLBIa9ZHW-eUEXETZnFdJqj6lU32IEyjJBhx1nNC_w6-0AGgH9ZC2c54sxUtmfOHmB9AhjYAmXi7Nmr2mQpDTBgrlPCQmNFLJ8GPWP0G6cDAgvZryVyFUm2z7SEcUzzLH6jHyr48ggGJBikNxZ4WL3W7L-zx_6v8BQBxBUp2KcZFzrfaXO1uoY2EyD3Y4ynTEUuEncS-UdRczCZCz6PqViyHJLycMnQteTw0j0ivEsLOlJkADufPX11f8ScVadd1YU-824nD6D5Kc16DRy0z1fHl1ZouI6Ahp3wY3AT-CR5te9kvYJUn_ggjvsm4d8CYc1qI6i1lfrNeeBxXCaK.dhOQv7UopiggmdGfsp-xmQ/models/epoch=0-step=34602.ckpt'

try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception:
    print("Checkpoint not loaded")

# Gradio input/output components and a default example
image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")
examples = [["remote.jpg", "what number is the button near the top left?"]]

# ViT feature extractor used to turn the input image into pixel values
from transformers import ViTFeatureExtractor, ViTModel
vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")


def answer_question(image, question):

    # Extracting features from the image
    image.save("sample.png")
    img, boxes, tokenized_words = create_features("sample.png",
                                                  tokenizer=tokenizer,
                                                  target_size=target_size,
                                                  max_seq_length=max_seq_len,
                                                  use_ocr=True
                                                  )

    ## Converting the boxes as per the format required for model input
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], axis=-1)

    ## Clamping the values, as some of the box coordinates fall out of bounds (valid range is 0-1000)
    boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=1000)
    boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=1000)
    boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=1000)
    boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=1000)
    boxes[:, 4] = torch.clamp(boxes[:, 4], min=0, max=1000)
    boxes[:, 5] = torch.clamp(boxes[:, 5], min=0, max=1000)
    ## Tensor of tokenized words
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)

    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    ## Expanding the dimension for inference (batch size of 1)
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    # print("Shape of Image is:", img.shape)
    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if len(img.shape) == 3:
        img = img.unsqueeze(0)

    encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}

    with torch.no_grad():
        logits = latr.forward(encoding)

    logits = logits.squeeze(0)
    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()

    # Trim the prediction at the last non-padding token before decoding
    mask = torch.clamp(preds, min=0, max=1)
    last_non_zero_argument = (mask != 0).nonzero()[-1][-1]
    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)

    return predicted_ans


# Taken from here: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: LaTr (Layout Aware Transformer) for VQA"
description = "Gradio Demo for LaTr (Layout Aware Transformer), trained on the TextVQA dataset. To use it, simply upload your image, type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = ("LaTr: Layout-Aware Transformer for Scene-Text VQA, a novel multimodal architecture "
           "for Scene Text Visual Question Answering (STVQA) | Github Repo")

" examples = [['remote.png', "Is remote present in the picture?"]] interface = gr.Interface(fn=answer_question, inputs=[image, question], outputs=answer, examples=examples, title=title, description=description, article=article, enable_queue=True) interface.launch(debug=True)