"""Gradio demo for TAIMGAN: modify an image according to a text caption."""

import numpy as np  # noqa: F401  # import first to mitigate the mkl-service bug

from pathlib import Path

import gradio as gr
import torch
from PIL import Image
from torchvision import transforms

from src.config import config_dict
from src.data import TAIMGANTokenizer
from src.models.modules import (
    VGGEncoder,
    InceptionEncoder,
    TextEncoder,
    Generator,
)
from src.models.utils import get_image_arr, load_model

##########
# PARAMS #
##########
IMG_CHANS = 3       # RGB channels for image
IMG_HW = 256        # height and width of images
HIDDEN_DIM = 128    # hidden dimensions of LSTM cell in one direction
C = 2 * HIDDEN_DIM  # length of embeddings (bidirectional LSTM)
Ng = config_dict["Ng"]
cond_dim = config_dict["condition_dim"]
z_dim = config_dict["noise_dim"]

###############
# LOAD MODELS #
###############
models = {
    "COCO": {"dir": "weights/coco"},
    "Bird": {"dir": "weights/bird"},
    "UTKFace": {"dir": "weights/utkface"},
}

for model_name in models:
    model = models[model_name]
    # create the tokenizer from the captions shipped with each checkpoint
    model["tokenizer"] = TAIMGANTokenizer(
        captions_path=f"{model['dir']}/captions.pickle"
    )
    vocab_size = len(model["tokenizer"].word_to_ix)
    # instantiate the networks in eval mode
    model["generator"] = Generator(
        Ng=Ng, D=C, conditioning_dim=cond_dim, noise_dim=z_dim
    ).eval()
    model["lstm"] = TextEncoder(
        vocab_size=vocab_size, emb_dim=C, hidden_dim=HIDDEN_DIM
    ).eval()
    model["vgg"] = VGGEncoder().eval()
    model["inception"] = InceptionEncoder(D=C).eval()
    # load the pretrained weights
    load_model(
        generator=model["generator"],
        discriminator=None,
        image_encoder=model["inception"],
        text_encoder=model["lstm"],
        output_dir=Path(model["dir"]),
        device=torch.device("cpu"),
    )


def change_image_with_text(
    image: Image.Image, text: str, model_name: str
) -> Image.Image:
    """
    Modify the input image so that it matches the given caption.

    :param Image.Image image: Input image to be modified
    :param str text: Desired caption
    :param str model_name: Which pretrained model to use (a key of ``models``)
    :return: The modified image
    """
    tokenizer = models[model_name]["tokenizer"]
    G = models[model_name]["generator"]
    lstm = models[model_name]["lstm"]
    inception = models[model_name]["inception"]
    vgg = models[model_name]["vgg"]

    with torch.no_grad():  # inference only, no gradients needed
        # sample a noise vector for the generator
        noise = torch.rand(z_dim).unsqueeze(0)
        # tokenize the caption, build the padding mask and the text embeddings
        tokens = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
        mask = tokens == tokenizer.pad_token_id
        word_embs, sent_embs = lstm(tokens)
        # convert the PIL image to a resized, normalized tensor
        image = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((IMG_HW, IMG_HW)),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
        ])(image).unsqueeze(0)
        # obtain visual features of the image
        vgg_features = vgg(image)
        local_features, global_features = inception(image)
        # generate the new image conditioned on the old one and the caption
        fake_image, _, _ = G(
            noise, sent_embs, word_embs,
            global_features, local_features, vgg_features, mask,
        )
    # denormalize and return the image in a format Gradio accepts
    return Image.fromarray(get_image_arr(fake_image)[0])


##########
# GRADIO #
##########
demo = gr.Interface(
    fn=change_image_with_text,
    inputs=[
        gr.Image(type="pil"),
        "text",
        gr.Dropdown(choices=list(models.keys())),  # replaces the deprecated gr.inputs.Dropdown
    ],
    outputs=gr.Image(type="pil"),
    examples=[
        ["src/data/stubs/car.jpeg", "black car on the green road", "COCO"],
        ["src/data/stubs/lady.jpg", "lady with blue eyes", "UTKFace"],
        ["src/data/stubs/bird.jpg", "white bird with black wings", "Bird"],
    ],
)
demo.launch(debug=True)
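
# A minimal sketch of calling the pipeline programmatically, without the Gradio
# UI (left commented out because demo.launch() above blocks; the output path
# "bird_modified.jpg" is a hypothetical example, not defined by the repo):
#
#     img = Image.open("src/data/stubs/bird.jpg").convert("RGB")
#     out = change_image_with_text(img, "white bird with black wings", "Bird")
#     out.save("bird_modified.jpg")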