"""
Download the weights into ./checkpoints beforehand for fast inference:

wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth
wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth
wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth
"""

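# Expected checkpoint layout on disk, matching the paths Predictor.setup()
# loads below:
#
#   checkpoints/model*_base_caption.pth
#   checkpoints/model*_vqa.pth
#   checkpoints/model_base_retrieval_coco.pth
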
from pathlib import Path

from PIL import Image
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import cog

from models.blip import blip_decoder
from models.blip_vqa import blip_vqa
from models.blip_itm import blip_itm

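# Note: this predictor targets cog's older decorator-based API (cog.Predictor
# plus @cog.input). Newer cog releases use cog.BasePredictor with cog.Input
# defaults instead, so running under a recent cog would require migrating.
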
class Predictor(cog.Predictor):
    def setup(self):
        self.device = "cuda:0"

        # Load all three BLIP heads once at startup so each request only
        # pays for inference, not model construction.
        self.models = {
            'image_captioning': blip_decoder(pretrained='checkpoints/model*_base_caption.pth',
                                             image_size=384, vit='base'),
            'visual_question_answering': blip_vqa(pretrained='checkpoints/model*_vqa.pth',
                                                  image_size=480, vit='base'),
            'image_text_matching': blip_itm(pretrained='checkpoints/model_base_retrieval_coco.pth',
                                            image_size=384, vit='base')
        }

    @cog.input(
        "image",
        type=Path,
        help="Input image.",
    )
    @cog.input(
        "task",
        type=str,
        default='image_captioning',
        options=['image_captioning', 'visual_question_answering', 'image_text_matching'],
        help="Choose a task.",
    )
    @cog.input(
        "question",
        type=str,
        default=None,
        help="Question about the input image (required for the visual_question_answering task).",
    )
    @cog.input(
        "caption",
        type=str,
        default=None,
        help="Caption to score against the input image (required for the image_text_matching task).",
    )
    def predict(self, image, task, question, caption):
        if task == 'visual_question_answering':
            assert question is not None, 'Please type a question for the visual question answering task.'
        if task == 'image_text_matching':
            assert caption is not None, 'Please type a caption for the image text matching task.'

        # The VQA checkpoint expects 480x480 inputs; captioning and matching use 384x384.
        im = load_image(image, image_size=480 if task == 'visual_question_answering' else 384, device=self.device)
        model = self.models[task]
        model.eval()
        model = model.to(self.device)

        if task == 'image_captioning':
            with torch.no_grad():
                caption = model.generate(im, sample=False, num_beams=3, max_length=20, min_length=5)
                return 'Caption: ' + caption[0]

        if task == 'visual_question_answering':
            with torch.no_grad():
                answer = model(im, question, train=False, inference='generate')
                return 'Answer: ' + answer[0]

        # image_text_matching: the ITM head scores whether image and caption
        # match; the ITC head returns the cosine similarity of their embeddings.
        with torch.no_grad():
            itm_output = model(im, caption, match_head='itm')
            itm_score = torch.nn.functional.softmax(itm_output, dim=1)[:, 1]
            itc_score = model(im, caption, match_head='itc')
        return f'The image and text are matched with a probability of {itm_score.item():.4f}.\n' \
               f'The image feature and text feature have a cosine similarity of {itc_score.item():.4f}.'


def load_image(image, image_size, device):
    raw_image = Image.open(str(image)).convert('RGB')

    # Resize to the model's expected resolution and normalize with the CLIP
    # image mean/std that BLIP uses for preprocessing.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ])
    image = transform(raw_image).unsqueeze(0).to(device)
    return image
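
# Rough local smoke test, bypassing the cog runtime. Assumes a CUDA GPU, the
# checkpoints listed at the top, and a hypothetical example.jpg next to this
# script; the @cog.input decorators may wrap predict(), so treat this as a
# sketch rather than a supported entry point:
#
#   predictor = Predictor()
#   predictor.setup()
#   print(predictor.predict(Path('example.jpg'), 'image_captioning', None, None))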