# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------

import cv2
import torch
import numpy as np
from PIL import Image
from torchvision import transforms

# Resize used for the copy of the image that is handed to the model.
t = []
t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
transform = transforms.Compose(t)

# Resize used for the copy of the image that is annotated and returned.
t = []
t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
transform_v = transforms.Compose(t)


def image_captioning(model, image, texts, inpainting_text, *args, **kwargs):
    """Caption ``image`` with ``model`` and draw the caption onto a resized copy.

    Two resized versions of ``image`` are produced: one via ``transform``
    (fed to the model) and one via ``transform_v`` (used for display). The
    caption is rendered in white on a filled black strip along the bottom
    edge of the display copy.

    Args:
        model: Object exposing ``model.model.evaluate_captioning(batch)``.
        image: Input image; it must be accepted by the module-level resize
            transforms and expose ``.size`` as ``(width, height)`` afterwards
            (presumably a ``PIL.Image`` -- confirm against the caller). It
            must convert to a 3-dimensional array, since the array is
            permuted with ``(2, 0, 1)``.
        texts: Unused by this function.
        inpainting_text: Unused by this function.
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        A tuple ``(annotated_image, caption, None)`` where ``annotated_image``
        is a ``PIL.Image`` built from the display copy with the caption drawn
        on it and ``caption`` is the ``'captioning_text'`` entry of the last
        element returned by the model.
    """
    # Geometry of the caption banner, in pixels of the display image.
    banner_height = 60
    text_left_margin = 10
    text_bottom_margin = 20

    display_img = transform_v(image)
    width, height = display_img.size[0], display_img.size[1]
    # np.array() copies, giving a writable buffer for the OpenCV drawing calls.
    canvas = np.array(display_img)

    # Fall back to the CPU when CUDA is unavailable instead of raising from a
    # hard-coded ``.cuda()`` call; with CUDA present this is equivalent.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_input = np.array(transform(image))
    images = torch.from_numpy(model_input).permute(2, 0, 1).to(device)

    # The reported height/width are those of the display copy, not of the
    # tensor that is passed to the model.
    batch_inputs = [{'image': images, 'height': height, 'width': width, 'image_id': 0}]
    with torch.no_grad():
        outputs = model.model.evaluate_captioning(batch_inputs)
    text = outputs[-1]['captioning_text']

    # Filled (thickness -1) black strip across the bottom of the canvas.
    cv2.rectangle(canvas, (0, height - banner_height), (width, height), (0, 0, 0), -1)
    cv2.putText(
        canvas,
        text,
        (text_left_margin, height - text_bottom_margin),
        cv2.FONT_HERSHEY_DUPLEX,
        1.2,              # font scale
        (255, 255, 255),  # font colour: white
        2,                # stroke thickness
        2,                # line type
    )

    torch.cuda.empty_cache()
    return Image.fromarray(canvas), text, None