"""Gradio demo that captions an image with a CLIP-Caption-Reward model.

At import time this script:

1. picks a config file based on ``reward`` and parses it,
2. downloads the matching checkpoint folder and ``cocotalk.json`` via gdown,
3. builds the captioning model and loads the ``-last.ckpt`` checkpoint,
4. loads the CLIP RN50 encoder and replaces its attention-pool positional
   embedding,
5. builds and launches a Gradio UI whose button calls :func:`inference`.
"""
import json
import os

import torch
import torch.nn as nn
import numpy as np
import captioning.utils.opts as opts
import captioning.models as models
import captioning.utils.misc as utils
import pytorch_lightning as pl
import gradio as gr
import gdown
import clip
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from timm.models.vision_transformer import resize_pos_embed


# Checkpoint class
class ModelCheckpoint(pl.callbacks.ModelCheckpoint):
    """Checkpoint callback that writes an extra checkpoint on Ctrl-C."""

    def on_keyboard_interrupt(self, trainer, pl_module):
        """Save the model to ``<dirpath>/<prefix>interrupt.ckpt``.

        NOTE(review): relies on ``self.prefix`` and ``self._save_model`` being
        provided by the installed pytorch_lightning base class - confirm
        against the pinned version.
        """
        # Save model when keyboard interrupt
        filepath = os.path.join(self.dirpath, self.prefix + 'interrupt.ckpt')
        self._save_model(filepath)


# Google Drive folder holding the checkpoint for each supported reward.
# NOTE(review): "clips" and "cider_clips" point at the same folder, exactly as
# in the original script - confirm this is intended and not a copy-paste slip.
CHECKPOINT_FOLDER_URLS = {
    "mle": "https://drive.google.com/drive/folders/1hfHWDn5iXsdjB63E5zdZBAoRLWHQC3LD",
    "cider": "https://drive.google.com/drive/folders/1MnSmCd8HFnBvQq_4K-q4vsVkzEw0OIOs",
    "clips": "https://drive.google.com/drive/folders/1toceycN-qilHsbYjKalBLtHJck1acQVe",
    "cider_clips": "https://drive.google.com/drive/folders/1toceycN-qilHsbYjKalBLtHJck1acQVe",
    "clips_grammar": "https://drive.google.com/drive/folders/1nSX9aS7pPK4-OTHYtsUD_uEkwIQVIV7W",
}

device = 'cpu' #@param ["cuda", "cpu"] {allow-input: true}

reward = 'clips_grammar' #@param ["mle", "cider", "clips", "cider_clips", "clips_grammar"] {allow-input: true}

# Fail early with a readable message instead of a NameError on `url` later.
if reward not in CHECKPOINT_FOLDER_URLS:
    raise ValueError(
        f"Unknown reward {reward!r}; expected one of {sorted(CHECKPOINT_FOLDER_URLS)}"
    )

# The MLE model lives under phase1; every other reward is a phase2 config.
if reward == 'mle':
    cfg = f'./configs/phase1/clipRN50_{reward}.yml'
else:
    cfg = f'./configs/phase2/clipRN50_{reward}.yml'

print("Loading cfg from", cfg)

opt = opts.parse_opt(parse=False, cfg=cfg)

# Download the checkpoint folder for the selected reward into save/.
url = CHECKPOINT_FOLDER_URLS[reward]
gdown.download_folder(url, quiet=True, use_cookies=False, output="save/")

# Download the vocabulary file that is read below as ./data/cocotalk.json.
url = "https://drive.google.com/uc?id=1HNRE1MYO9wxmtMHLC8zURraoNFu157Dp"
gdown.download(url, quiet=True, use_cookies=False, output="data/")

# Use a context manager so the file handle is closed after parsing.
with open('./data/cocotalk.json') as vocab_file:
    dict_json = json.load(vocab_file)
print(dict_json.keys())

ix_to_word = dict_json['ix_to_word']
vocab_size = len(ix_to_word)
print('vocab size:', vocab_size)

seq_length = 1

opt.vocab_size = vocab_size
opt.seq_length = seq_length

opt.batch_size = 1
opt.vocab = ix_to_word
# opt.use_grammar = False

model = models.setup(opt)
# The vocab was only needed on `opt` while the model was being built.
del opt.vocab

ckpt_path = opt.checkpoint_path + '-last.ckpt'

print("Loading checkpoint from", ckpt_path)
# NOTE(security): torch.load unpickles the file, and this checkpoint was just
# downloaded from Google Drive. Only use checkpoints from a trusted source.
raw_state_dict = torch.load(
    ckpt_path,
    map_location=device)

strict = True

state_dict = raw_state_dict['state_dict']

# '_vocab' and '_opt' are serialized extras stored next to the weights; they
# must be removed before the dict is handed to load_state_dict.
if '_vocab' in state_dict:
    model.vocab = utils.deserialize(state_dict['_vocab'])
    del state_dict['_vocab']
elif strict:
    raise KeyError("checkpoint state_dict has no '_vocab' entry")
if '_opt' in state_dict:
    saved_model_opt = utils.deserialize(state_dict['_opt'])
    del state_dict['_opt']
    # Make sure the saved opt is compatible with the current opt
    need_be_same = ["caption_model",
                    "rnn_type", "rnn_size", "num_layers"]
    for checkme in need_be_same:
        # 'updown' and 'topdown' are accepted as matching each other.
        if getattr(saved_model_opt, checkme) in ['updown', 'topdown'] and \
                getattr(opt, checkme) in ['updown', 'topdown']:
            continue
        assert getattr(saved_model_opt, checkme) == getattr(
            opt, checkme), "Command line argument and saved model disagree on '%s' " % checkme
elif strict:
    raise KeyError("checkpoint state_dict has no '_opt' entry")
res = model.load_state_dict(state_dict, strict)
print(res)

model = model.to(device)
model.eval()

clip_model, clip_transform = clip.load("RN50", jit=False, device=device)

# Resize to a fixed 448x448 input and convert to a tensor; mean/std
# normalization is applied separately inside inference().
preprocess = Compose([
    Resize((448, 448), interpolation=Image.BICUBIC),
    CenterCrop((448, 448)),
    ToTensor()
])

image_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073]).to(device).reshape(3, 1, 1)
image_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711]).to(device).reshape(3, 1, 1)

num_patches = 196 #600 * 1000 // 32 // 32
pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, clip_model.visual.attnpool.positional_embedding.shape[-1], device=device),)
# NOTE(review): this sets a `.weight` attribute on the Parameter object and
# does not change the Parameter's own values, which stay at zero. It is left
# untouched on purpose: the checkpoints may have been trained on features
# extracted with this exact setup, so "fixing" it here could degrade captions.
# Confirm against the feature-extraction code before changing it.
pos_embed.weight = resize_pos_embed(clip_model.visual.attnpool.positional_embedding.unsqueeze(0), pos_embed)
clip_model.visual.attnpool.positional_embedding = pos_embed


def inference(img):
    """Generate a caption for one image.

    Args:
        img: PIL image, as delivered by the ``gr.Image(type="pil")`` input.

    Returns:
        The first decoded sentence produced by the captioning model.

    Raises:
        ValueError: if ``img`` is None (for example, Run was pressed before
            an image was provided).
    """
    if img is None:
        raise ValueError("No image provided; please upload an image first.")

    with torch.no_grad():
        # Add a batch dimension, then apply the mean/std normalization.
        image = preprocess(img).unsqueeze(0).to(device)
        image = (image - image_mean) / image_std

        # Only the attention features are used; the second output is ignored.
        tmp_att, _ = clip_model.encode_image(image)
        att_feat = tmp_att[0].permute(1, 2, 0)

        # Inference configuration: all parsed options, with a single sample
        # per image.
        eval_kwargs = {**vars(opt), 'sample_n': 1}

        # The model is fed an empty fc feature and the flattened attention
        # features, with no attention mask.
        fc_feats = torch.zeros((1, 0)).to(device)
        att_feats = att_feat.view(1, 196, 2048).float().to(device)
        att_masks = None

        # forward the model to also get generated samples for each image
        seq, _ = model(
            fc_feats, att_feats, att_masks, opt=eval_kwargs, mode='sample')
        seq = seq.data

        sents = utils.decode_sequence(model.vocab, seq)

    return sents[0]


demo = gr.Blocks()

with demo:
    gr.Markdown(""" # Gradio Demo for [j-min/CLIP-Caption-Reward](https://github.com/j-min/CLIP-Caption-Reward) """)

    inp = gr.Image(type="pil")
    out = gr.Textbox()

    image_button = gr.Button("Run")
    image_button.click(fn=inference,
                       inputs=inp,
                       outputs=out,
                       api_name="clip_caption")

demo.launch()