"""
=========================================================================================
Trojan VQA
Written by Matthew Walmer
Inference wrapper for trained OpenVQA models
=========================================================================================
"""
import yaml, os, torch, re, json
import numpy as np
import torch.nn as nn
from openvqa.models.model_loader import ModelLoader
from openvqa.models.model_loader import CfgLoader
root = os.path.dirname(os.path.realpath(__file__))
# Helper to replace argparse for loading proper inference settings
class Openvqa_Args_Like():
    def __init__(self, model_type, model_path, nb, over_fs=1024, gpu='0'):
        self.RUN_MODE = 'val'
        self.MODEL = model_type
        self.DATASET = 'vqa'
        self.SPLIT = 'train'
        self.BS = 64
        self.GPU = gpu
        self.SEED = 1234
        self.VERSION = 'temp'
        self.RESUME = 'True'
        self.CKPT_V = ''
        self.CKPT_E = ''
        self.CKPT_PATH = model_path
        self.NUM_WORKERS = 1
        self.PINM = 'True'
        self.VERBOSE = 'False'
        self.DETECTOR = ''
        self.OVER_FS = over_fs
        self.OVER_NB = int(nb)
# Wrapper for inference with a pre-trained OpenVQA model. During init, the user
# specifies the model type, the model file (.pkl) path, and the number of input image
# features, and optionally the feature size and the gpu to run on. The function 'run'
# can then run inference on two simple inputs: an image feature tensor and a question
# given as a string.
class Openvqa_Wrapper():
    def __init__(self, model_type, model_path, nb, over_fs=1024, gpu='0'):
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        # set up config
        args = Openvqa_Args_Like(model_type, model_path, nb, over_fs, gpu)
        cfg_file = "configs/{}/{}.yml".format(args.DATASET, args.MODEL)
        if not os.path.isfile(cfg_file):
            cfg_file = "{}/configs/{}/{}.yml".format(root, args.DATASET, args.MODEL)
        with open(cfg_file, 'r') as f:
            yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
        __C = CfgLoader(yaml_dict['MODEL_USE']).load()
        args = __C.str_to_bool(args)
        args_dict = __C.parse_to_dict(args)
        args_dict = {**yaml_dict, **args_dict}
        __C.add_args(args_dict)
        __C.proc(check_path=False)
        # override feature size
        if __C.OVER_FS != -1 or __C.OVER_NB != -1:
            NEW_FS = 2048
            NEW_NB = 100
            if __C.OVER_FS != -1:
                print('Overriding feature size to: ' + str(__C.OVER_FS))
                NEW_FS = __C.OVER_FS
                __C.IMG_FEAT_SIZE = NEW_FS
            if __C.OVER_NB != -1:
                print('Overriding number of boxes to: ' + str(__C.OVER_NB))
                NEW_NB = __C.OVER_NB
            __C.FEAT_SIZE['vqa']['FRCN_FEAT_SIZE'] = (NEW_NB, NEW_FS)
            __C.FEAT_SIZE['vqa']['BBOX_FEAT_SIZE'] = (NEW_NB, 5)
        # update path information
        __C.update_paths()
        # prep
        token_size = 20573
        ans_size = 3129
        pretrained_emb = np.zeros([token_size, 300], dtype=np.float32)
        # load network
        net = ModelLoader(__C).Net(
            __C,
            pretrained_emb,
            token_size,
            ans_size
        )
        net.to(self.device)
        net.eval()
        if __C.N_GPU > 1:
            net = nn.DataParallel(net, device_ids=__C.DEVICES)
        # Load checkpoint
        print(' ========== Loading checkpoint')
        print('Loading ckpt from {}'.format(model_path))
        ckpt = torch.load(model_path, map_location=self.device)
        print('Finish!')
        if __C.N_GPU > 1:
            net.load_state_dict(ckpt_proc(ckpt['state_dict']))
        else:
            net.load_state_dict(ckpt['state_dict'])
        self.model = net
        # Load tokenizer and answers
        token_file = '{}/openvqa/datasets/vqa/token_dict.json'.format(root)
        self.token_to_ix = json.load(open(token_file, 'r'))
        ans_dict = '{}/openvqa/datasets/vqa/answer_dict.json'.format(root)
        ans_to_ix = json.load(open(ans_dict, 'r'))[0]
        self.ix_to_ans = {}
        for key in ans_to_ix:
            self.ix_to_ans[ans_to_ix[key]] = key
    # based on version in vqa_loader.py
    def proc_ques(self, ques, token_to_ix, max_token):
        ques_ix = np.zeros(max_token, np.int64)
        words = re.sub(
            r"([.,'!?\"()*#:;])",
            '',
            ques.lower()
        ).replace('-', ' ').replace('/', ' ').split()
        for ix, word in enumerate(words):
            if word in token_to_ix:
                ques_ix[ix] = token_to_ix[word]
            else:
                ques_ix[ix] = token_to_ix['UNK']
            if ix + 1 == max_token:
                break
        return ques_ix
    # Inputs are a tensor of image features with shape [nb, over_fs] (1024 by default)
    # and a raw question in string form. The bbox_features input is only used by
    # mmnasnet models.
    def run(self, image_features, raw_question, bbox_features):
        ques_ix = self.proc_ques(raw_question, self.token_to_ix, max_token=14)
        frcn_feat_iter = torch.unsqueeze(image_features, 0).to(self.device)
        grid_feat_iter = torch.zeros(1).to(self.device)
        bbox_feat_iter = torch.unsqueeze(bbox_features, 0).to(self.device)
        ques_ix_iter = torch.unsqueeze(torch.from_numpy(ques_ix), 0).to(self.device)
        pred = self.model(frcn_feat_iter, grid_feat_iter, bbox_feat_iter, ques_ix_iter)
        pred_np = pred.cpu().data.numpy()
        pred_argmax = np.argmax(pred_np, axis=1)
        ans = self.ix_to_ans[pred_argmax[0]]
        return ans
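

# Minimal usage sketch (illustrative only): the model type, checkpoint path, and
# feature tensors below are hypothetical placeholders, not assets shipped with this
# file. Real inputs would be Faster R-CNN region features extracted for one image.
if __name__ == '__main__':
    wrapper = Openvqa_Wrapper('butd', 'ckpts/example_model.pkl', nb=36, over_fs=1024)
    img_feats = torch.rand(36, 1024)   # [nb, over_fs] stand-in region features
    bbox_feats = torch.rand(36, 5)     # [nb, 5] stand-in box features (mmnasnet only)
    print(wrapper.run(img_feats, 'what color is the car?', bbox_feats))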