""" Download the weights in ./checkpoints beforehand for fast inference wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth """ from pathlib import Path from PIL import Image import torch from torchvision import transforms from torchvision.transforms.functional import InterpolationMode import cog from models.blip import blip_decoder from models.blip_vqa import blip_vqa from models.blip_itm import blip_itm class Predictor(cog.Predictor): def setup(self): self.device = "cuda:0" self.models = { 'image_captioning': blip_decoder(pretrained='checkpoints/model*_base_caption.pth', image_size=384, vit='base'), 'visual_question_answering': blip_vqa(pretrained='checkpoints/model*_vqa.pth', image_size=480, vit='base'), 'image_text_matching': blip_itm(pretrained='checkpoints/model_base_retrieval_coco.pth', image_size=384, vit='base') } @cog.input( "image", type=Path, help="input image", ) @cog.input( "task", type=str, default='image_captioning', options=['image_captioning', 'visual_question_answering', 'image_text_matching'], help="Choose a task.", ) @cog.input( "question", type=str, default=None, help="Type question for the input image for visual question answering task.", ) @cog.input( "caption", type=str, default=None, help="Type caption for the input image for image text matching task.", ) def predict(self, image, task, question, caption): if task == 'visual_question_answering': assert question is not None, 'Please type a question for visual question answering task.' if task == 'image_text_matching': assert caption is not None, 'Please type a caption for mage text matching task.' im = load_image(image, image_size=480 if task == 'visual_question_answering' else 384, device=self.device) model = self.models[task] model.eval() model = model.to(self.device) if task == 'image_captioning': with torch.no_grad(): caption = model.generate(im, sample=False, num_beams=3, max_length=20, min_length=5) return 'Caption: ' + caption[0] if task == 'visual_question_answering': with torch.no_grad(): answer = model(im, question, train=False, inference='generate') return 'Answer: ' + answer[0] # image_text_matching itm_output = model(im, caption, match_head='itm') itm_score = torch.nn.functional.softmax(itm_output, dim=1)[:, 1] itc_score = model(im, caption, match_head='itc') return f'The image and text is matched with a probability of {itm_score.item():.4f}.\n' \ f'The image feature and text feature has a cosine similarity of {itc_score.item():.4f}.' def load_image(image, image_size, device): raw_image = Image.open(str(image)).convert('RGB') w, h = raw_image.size transform = transforms.Compose([ transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) ]) image = transform(raw_image).unsqueeze(0).to(device) return image