import argparse
import json
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
from PIL import Image
from tqdm import tqdm
from torchvision import transforms

from pope_loader import POPEDataSet
from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank
from minigpt4.common.registry import registry
from minigpt4.models import load_preprocess

# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *

MODEL_EVAL_CONFIG_PATH = {
    "minigpt4": "eval_configs/minigpt4_eval.yaml",
    "instructblip": "eval_configs/instructblip_eval.yaml",
    "lrv_instruct": "eval_configs/lrv_instruct_eval.yaml",
    "shikra": "eval_configs/shikra_eval.yaml",
    "llava-1.5": "eval_configs/llava-1.5_eval.yaml",
}

# The angle-bracket placeholder tokens were stripped from the original text
# (most likely by HTML extraction); they are restored here so that the
# template.replace("<question>", qu) call below has something to substitute.
INSTRUCTION_TEMPLATE = {
    "minigpt4": "###Human: <ImageHere><question> ###Assistant:",
    "instructblip": "<ImageHere><question>",
    "lrv_instruct": "###Human: <ImageHere><question> ###Assistant:",
    "shikra": "USER: <im_start><ImageHere><im_end> <question> ASSISTANT:",
    "llava-1.5": "USER: <ImageHere> <question> ASSISTANT:",
}


def setup_seeds(config):
    seed = config.run_cfg.seed + get_rank()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True


parser = argparse.ArgumentParser(description="POPE-Adv evaluation on LVLMs.")
parser.add_argument("--model", type=str, help="model")
parser.add_argument("--gpu-id", type=int, help="specify the gpu to load the model.")
parser.add_argument(
    "--options",
    nargs="+",
    help="override some settings in the used config, the key-value pair "
    "in xxx=yyy format will be merged into config file (deprecated), "
    "change to --cfg-options instead.",
)
parser.add_argument("--data_path", type=str, default="COCO_2014/val2014/", help="data path")
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--num_workers", type=int, default=2, help="num workers")
parser.add_argument("--beam", type=int)
parser.add_argument("--sample", action="store_true")
parser.add_argument("--scale_factor", type=float, default=50)
parser.add_argument("--threshold", type=int, default=15)
parser.add_argument("--num_attn_candidates", type=int, default=5)
parser.add_argument("--penalty_weights", type=float, default=1.0)
args = parser.parse_known_args()[0]

os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
args.cfg_path = MODEL_EVAL_CONFIG_PATH[args.model]
cfg = Config(args)
setup_seeds(cfg)
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

# ========================================
#           Model Initialization
# ========================================
print("Initializing Model")

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
model.eval()

processor_cfg = cfg.get_config().preprocess
processor_cfg.vis_processor.eval.do_normalize = False
vis_processors, txt_processors = load_preprocess(processor_cfg)
print(vis_processors["eval"].transform)
print("Done!")

# CLIP-style normalization is applied manually at generation time, because
# do_normalize was disabled in the visual processor above.
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
norm = transforms.Normalize(mean, std)

img_files = os.listdir(args.data_path)
random.shuffle(img_files)
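# The COCO instances_val2014.json file is one JSON object whose top-level keys
# include "images" (id -> file_name), "annotations" (image_id -> category_id),
# and "categories" (id -> name). The block below inverts these into a
# per-image list of object-category names for later hallucination checks.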
with open(
    os.path.join(args.data_path, "../annotations_trainval2014/annotations/instances_val2014.json"),
    "r",
) as f:
    coco_anns = json.load(f)  # the whole file is a single JSON object

img_dict = {}

categories = coco_anns["categories"]
category_names = [c["name"] for c in categories]
category_dict = {int(c["id"]): c["name"] for c in categories}

for img_info in coco_anns["images"]:
    img_dict[img_info["id"]] = {"name": img_info["file_name"], "anns": []}

for ann_info in coco_anns["annotations"]:
    img_dict[ann_info["image_id"]]["anns"].append(
        category_dict[ann_info["category_id"]]
    )

base_dir = "./log/" + args.model
os.makedirs(base_dir, exist_ok=True)  # also creates ./log if it does not exist yet

for img_id in tqdm(range(len(img_files))):
    # evaluate on a fixed subset of 500 images
    if img_id == 500:
        break

    img_file = img_files[img_id]
    # COCO val2014 file names end with a zero-padded numeric image id,
    # e.g. COCO_val2014_000000XXXXXX.jpg
    img_id = int(img_file.split(".jpg")[0][-6:])
    img_info = img_dict[img_id]
    assert img_info["name"] == img_file
    img_anns = set(img_info["anns"])

    img_save = {}
    img_save["image_id"] = img_id

    image_path = os.path.join(args.data_path, img_file)
    raw_image = Image.open(image_path).convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0)
    image = image.to(device)

    qu = "Please describe this image in detail."
    template = INSTRUCTION_TEMPLATE[args.model]
    qu = template.replace("<question>", qu)

    with torch.inference_mode():  # inference_mode already implies no_grad
        out = model.generate(
            {"image": norm(image), "prompt": qu},
            use_nucleus_sampling=args.sample,
            num_beams=args.beam,
            max_new_tokens=512,
            output_attentions=True,
            opera_decoding=True,
            scale_factor=args.scale_factor,
            threshold=args.threshold,
            num_attn_candidates=args.num_attn_candidates,
            penalty_weights=args.penalty_weights,
        )
    img_save["caption"] = out[0]

    # dump metric file (one JSON object per line)
    with open(
        os.path.join(
            base_dir,
            "ours-s_{}-t_{}-num_can_{}-p_{}.jsonl".format(
                args.scale_factor,
                args.threshold,
                args.num_attn_candidates,
                args.penalty_weights,
            ),
        ),
        "a",
    ) as f:
        json.dump(img_save, f)
        f.write("\n")
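# Example invocation (illustrative only; "chair_eval.py" is a placeholder for
# whatever this file is saved as, and the flag values mirror the parser
# defaults above):
#   python chair_eval.py --model llava-1.5 --gpu-id 0 --beam 5 \
#       --data_path COCO_2014/val2014/ \
#       --scale_factor 50 --threshold 15 --num_attn_candidates 5 --penalty_weights 1.0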