import base64
import requests
from PIL import Image
from io import BytesIO
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from tqdm import tqdm
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from torchvision.utils import save_image

from pope_loader import POPEDataSet
from minigpt4.common.dist_utils import get_rank
from minigpt4.models import load_preprocess
from minigpt4.common.config import Config
from minigpt4.common.registry import registry

# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn
import json

MODEL_EVAL_CONFIG_PATH = {
    "minigpt4": "eval_configs/minigpt4_eval.yaml",
    "instructblip": "eval_configs/instructblip_eval.yaml",
    "lrv_instruct": "eval_configs/lrv_instruct_eval.yaml",
    "shikra": "eval_configs/shikra_eval.yaml",
    "llava-1.5": "eval_configs/llava-1.5_eval.yaml",
}

# "<ImageHere>" marks where the visual tokens are spliced in by each model wrapper,
# and "<question>" is replaced with the text prompt in the loop below.
INSTRUCTION_TEMPLATE = {
    "minigpt4": "###Human: <Img><ImageHere></Img> <question> ###Assistant:",
    "instructblip": "<ImageHere><question>",
    "lrv_instruct": "###Human: <Img><ImageHere></Img> <question> ###Assistant:",
    "shikra": "USER: <im_start><ImageHere><im_end> <question> ASSISTANT:",
    "llava-1.5": "USER: <ImageHere> <question> ASSISTANT:",
}

GPT_JUDGE_PROMPT = '''
You are required to score the performance of two AI assistants in describing a given image. You should pay extra attention to the hallucination, which refers to the part of descriptions that are inconsistent with the image content, such as claiming the existence of something not present in the image or describing incorrectly in terms of the counts, positions, or colors of objects in the image. Please rate the responses of the assistants on a scale of 1 to 10, where a higher score indicates better performance, according to the following criteria:
1: Accuracy: whether the response is accurate with respect to the image content. Responses with fewer hallucinations should be given higher scores.
2: Detailedness: whether the response is rich in necessary details. Note that hallucinated descriptions should not count as necessary details.
Please output the scores for each criterion, containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. Following the scores, please provide an explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.

[Assistant 1]
{}
[End of Assistant 1]

[Assistant 2]
{}
[End of Assistant 2]

Output format:
Accuracy: <Scores of the two answers>
Reason:

Detailedness: <Scores of the two answers>
Reason:
'''
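# Illustrative sanity check (an addition, not part of the original script): the two
# "{}" slots in GPT_JUDGE_PROMPT are filled positionally by str.format below, so
# Assistant 1 is always the plain beam-search caption and Assistant 2 the OPERA
# caption; swapping the arguments would silently swap which decoder each score
# refers to.
assert GPT_JUDGE_PROMPT.count("{}") == 2, "judge prompt expects exactly two responses"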
# OpenAI API Key
API_KEY = "YOUR_API_KEY"


def setup_seeds(config):
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cudnn.benchmark = False
    cudnn.deterministic = True


def call_api(prompt, image_path):
    # Encode the image as a base64 string for the vision API payload.
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    print(response.json().keys())
    return response.json()


def get_gpt4v_answer(prompt, image_path):
    # Retry until the API returns a well-formed response containing "choices".
    while True:
        try:
            res = call_api(prompt, image_path)
            if "choices" in res.keys():
                return res["choices"][0]["message"]["content"]
            else:
                assert False  # no "choices" in the response; trigger a retry
        except Exception as e:
            print(f"retry: {e}")


parser = argparse.ArgumentParser(description="GPT-4V assisted evaluation on LVLMs.")
parser.add_argument("--model", type=str, help="model")
parser.add_argument("--gpu-id", type=int, help="specify the gpu to load the model.")
parser.add_argument(
    "--options",
    nargs="+",
    help="override some settings in the used config, the key-value pair "
    "in xxx=yyy format will be merged into config file (deprecated), "
    "change to --cfg-options instead.",
)
parser.add_argument("--data_path", type=str, default="COCO_2014/val2014/", help="data path")
parser.add_argument("--batch_size", type=int, help="batch size")
parser.add_argument("--num_workers", type=int, default=2, help="num workers")
parser.add_argument("--scale_factor", type=float, default=50)
parser.add_argument("--threshold", type=int, default=15)
parser.add_argument("--num_attn_candidates", type=int, default=5)
parser.add_argument("--penalty_weights", type=float, default=1.0)
args = parser.parse_known_args()[0]

os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
args.cfg_path = MODEL_EVAL_CONFIG_PATH[args.model]
cfg = Config(args)
setup_seeds(cfg)
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

# ========================================
#            Model Initialization
# ========================================
print('Initializing Model')

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
model.eval()

processor_cfg = cfg.get_config().preprocess
processor_cfg.vis_processor.eval.do_normalize = False
vis_processors, txt_processors = load_preprocess(processor_cfg)
print(vis_processors["eval"].transform)
print("Done!")

# CLIP normalization is applied manually at generation time (do_normalize=False above).
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
norm = transforms.Normalize(mean, std)

img_files = os.listdir(args.data_path)
random.shuffle(img_files)

base_path = "log/gpt4v-eval"
if not os.path.exists(base_path + f"/{args.model}"):
    os.makedirs(base_path + f"/{args.model}")

gpt_answer_records = {}
assistant_answer_records = {}
avg_hal_score_1 = 0
avg_hal_score_2 = 0
avg_det_score_1 = 0
avg_det_score_2 = 0
num_count = 0
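# Optional sketch (not wired into the loop below): the judge's reply is parsed by
# splitting on "Accuracy: " / "Detailedness: ", which breaks whenever GPT-4V
# deviates from the requested output format. A slightly more tolerant regex-based
# alternative could look like this; `parse_judge_scores` is a hypothetical helper,
# not part of the original script.
import re


def parse_judge_scores(judge_reply, criterion):
    """Return the two integer scores on the '<criterion>:' line, or None if absent."""
    match = re.search(rf"{re.escape(criterion)}:\s*(\d+)\s+(\d+)", judge_reply)
    if match is None:
        return None
    return int(match.group(1)), int(match.group(2))

# Example: parse_judge_scores("Accuracy: 7 9\nReason: ...", "Accuracy") -> (7, 9)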
for idx in range(50):
    img = img_files[idx]
    image_path = os.path.join(args.data_path, img)
    raw_image = Image.open(image_path)
    raw_image = raw_image.convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0)
    image = image.to(device)

    qu = "Please describe this image in detail."
    template = INSTRUCTION_TEMPLATE[args.model]
    qu = template.replace("<question>", qu)

    assistant_answer_records[str(img)] = {}

    # Assistant 1: plain beam search decoding (baseline).
    with torch.inference_mode():
        with torch.no_grad():
            out = model.generate(
                {"image": norm(image), "prompt": qu},
                use_nucleus_sampling=False,
                num_beams=5,
                max_new_tokens=512,
            )
    model_response_1 = out[0]
    assistant_answer_records[str(img)]["assistant_1"] = model_response_1
    print("Beam-5 output:")
    print(model_response_1)

    # Assistant 2: OPERA decoding (over-trust penalty and retrospection-allocation).
    with torch.inference_mode():
        with torch.no_grad():
            out = model.generate(
                {"image": norm(image), "prompt": qu},
                use_nucleus_sampling=False,
                num_beams=5,
                max_new_tokens=512,
                output_attentions=True,
                opera_decoding=True,
                scale_factor=args.scale_factor,
                threshold=args.threshold,
                num_attn_candidates=args.num_attn_candidates,
                penalty_weights=args.penalty_weights,
            )
    model_response_2 = out[0]
    assistant_answer_records[str(img)]["assistant_2"] = model_response_2
    print("OPERA output:")
    print(model_response_2)

    # GPT-4V evaluation: ask the judge to score both responses on this image.
    prompt = GPT_JUDGE_PROMPT.format(model_response_1, model_response_2)
    gpt_answer = get_gpt4v_answer(prompt, image_path)
    print(gpt_answer)
    gpt_answer_records[str(img)] = gpt_answer
    print(gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" "))
    print(len(gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" ")))

    # Parse the two scores on the "Accuracy:" and "Detailedness:" lines;
    # skip this sample if the judge did not follow the requested format.
    try:
        hal_score_1, hal_score_2 = gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" ")
        det_score_1, det_score_2 = gpt_answer.split("Detailedness: ")[-1].split("\n")[0].split(" ")
    except ValueError:
        continue
    avg_hal_score_1 += int(hal_score_1)
    avg_hal_score_2 += int(hal_score_2)
    avg_det_score_1 += int(det_score_1)
    avg_det_score_2 += int(det_score_2)
    num_count += 1
    print("=========================================")

    # dump metric files (rewritten each iteration so partial results are kept)
    with open(os.path.join(base_path + f"/{args.model}", 'answers.json'), "w") as f:
        json.dump(assistant_answer_records, f)

    with open(os.path.join(base_path + f"/{args.model}", 'records.json'), "w") as f:
        json.dump(gpt_answer_records, f)

# Report the average scores over the successfully judged samples.
avg_hal_score_1 = float(avg_hal_score_1) / num_count
avg_hal_score_2 = float(avg_hal_score_2) / num_count
avg_det_score_1 = float(avg_det_score_1) / num_count
avg_det_score_2 = float(avg_det_score_2) / num_count
print(f"The avg hal score for Assistant 1 and Assistant 2: {avg_hal_score_1}; {avg_hal_score_2}")
print(f"The avg det score for Assistant 1 and Assistant 2: {avg_det_score_1}; {avg_det_score_2}")
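# Example invocation (the script filename is an assumption; the flags match the
# argparse definitions above):
#
#   python gpt4v_eval.py --model llava-1.5 --gpu-id 0 \
#       --data_path COCO_2014/val2014/ \
#       --scale_factor 50 --threshold 15 --num_attn_candidates 5 --penalty_weights 1.0
#
# The judged captions are written to log/gpt4v-eval/<model>/answers.json and the
# raw GPT-4V verdicts to log/gpt4v-eval/<model>/records.json.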