import base64
import requests
from PIL import Image
from io import BytesIO

import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
from tqdm import tqdm

from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from torchvision.utils import save_image

from pope_loader import POPEDataSet
from minigpt4.common.dist_utils import get_rank
from minigpt4.models import load_preprocess

from minigpt4.common.config import Config
from minigpt4.common.registry import registry

from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn
import json

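# Per-model paths to the evaluation configs and the prompt templates each LVLM expects.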
MODEL_EVAL_CONFIG_PATH = {
    "minigpt4": "eval_configs/minigpt4_eval.yaml",
    "instructblip": "eval_configs/instructblip_eval.yaml",
    "lrv_instruct": "eval_configs/lrv_instruct_eval.yaml",
    "shikra": "eval_configs/shikra_eval.yaml",
    "llava-1.5": "eval_configs/llava-1.5_eval.yaml",
}

INSTRUCTION_TEMPLATE = {
    "minigpt4": "###Human: <Img><ImageHere></Img> <question> ###Assistant:",
    "instructblip": "<ImageHere><question>",
    "lrv_instruct": "###Human: <Img><ImageHere></Img> <question> ###Assistant:",
    "shikra": "USER: <im_start><ImageHere><im_end> <question> ASSISTANT:",
    "llava-1.5": "USER: <ImageHere> <question> ASSISTANT:",
}

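# Prompt for the GPT-4V judge: it compares the two assistants' descriptions of the
# same image and scores each on accuracy (fewer hallucinations) and detailedness (1-10).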
GPT_JUDGE_PROMPT = '''
You are required to score the performance of two AI assistants in describing a given image. You should pay extra attention to the hallucination, which refers to the part of descriptions that are inconsistent with the image content, such as claiming the existence of something not present in the image or describing incorrectly in terms of the counts, positions, or colors of objects in the image. Please rate the responses of the assistants on a scale of 1 to 10, where a higher score indicates better performance, according to the following criteria:
1: Accuracy: whether the response is accurate with respect to the image content. Responses with fewer hallucinations should be given higher scores.
2: Detailedness: whether the response is rich in necessary details. Note that hallucinated descriptions should not count as necessary details.
Please output the scores for each criterion, containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. Following the scores, please provide an explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.

[Assistant 1]
{}
[End of Assistant 1]

[Assistant 2]
{}
[End of Assistant 2]

Output format:
Accuracy: <Scores of the two answers>
Reason:

Detailedness: <Scores of the two answers>
Reason:
'''

API_KEY = "YOUR_API_KEY"
|
|
|
|
|
|
|
|
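# Seed all RNGs (offset by the distributed rank) so the evaluation is reproducible.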
def setup_seeds(config):
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cudnn.benchmark = False
    cudnn.deterministic = True

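# Send a single judging request to the OpenAI chat-completions endpoint, attaching
# the image as a base64-encoded data URL, and return the parsed JSON response.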
def call_api(prompt, image_path):

    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    result = response.json()
    print(result.keys())
    return result

def get_gpt4v_answer(prompt, image_path):
    # Retry until the API returns a well-formed response with "choices".
    while True:
        try:
            res = call_api(prompt, image_path)
            if "choices" in res.keys():
                return res["choices"][0]["message"]["content"]
            else:
                assert False
        except Exception as e:
            print(f"retry: {e}")

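# Command-line arguments; the OPERA-specific decoding knobs (scale_factor, threshold,
# num_attn_candidates, penalty_weights) are passed through to model.generate below.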
parser = argparse.ArgumentParser(description="POPE-Adv evaluation on LVLMs.")
parser.add_argument("--model", type=str, help="model")
parser.add_argument("--gpu-id", type=int, help="specify the gpu to load the model.")
parser.add_argument(
    "--options",
    nargs="+",
    help="override some settings in the used config, the key-value pair "
    "in xxx=yyy format will be merged into config file (deprecated), "
    "change to --cfg-options instead.",
)
parser.add_argument("--data_path", type=str, default="COCO_2014/val2014/", help="data path")
parser.add_argument("--batch_size", type=int, help="batch size")
parser.add_argument("--num_workers", type=int, default=2, help="num workers")

parser.add_argument("--scale_factor", type=float, default=50)
parser.add_argument("--threshold", type=int, default=15)
parser.add_argument("--num_attn_candidates", type=int, default=5)
parser.add_argument("--penalty_weights", type=float, default=1.0)
args = parser.parse_known_args()[0]

os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
|
|
args.cfg_path = MODEL_EVAL_CONFIG_PATH[args.model]
|
|
cfg = Config(args)
|
|
setup_seeds(cfg)
|
|
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
print('Initializing Model')

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
model.eval()
processor_cfg = cfg.get_config().preprocess
processor_cfg.vis_processor.eval.do_normalize = False
vis_processors, txt_processors = load_preprocess(processor_cfg)
print(vis_processors["eval"].transform)
print("Done!")

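# CLIP normalization statistics, applied manually at generation time since
# do_normalize was disabled in the visual processor above.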
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
norm = transforms.Normalize(mean, std)

img_files = os.listdir(args.data_path)
random.shuffle(img_files)

# Create the per-model log directory (including parents) if it does not exist.
base_path = "log/gpt4v-eval"
if not os.path.exists(base_path + f"/{args.model}"):
    os.makedirs(base_path + f"/{args.model}")

gpt_answer_records = {}
assistant_answer_records = {}
avg_hal_score_1 = 0
avg_hal_score_2 = 0
avg_det_score_1 = 0
avg_det_score_2 = 0
num_count = 0

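# For each of 50 randomly sampled images: generate one description with vanilla
# beam search and one with OPERA decoding, then ask GPT-4V to judge the pair.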
for idx in range(50):
    img = img_files[idx]
    image_path = os.path.join(args.data_path, img)
    raw_image = Image.open(image_path)
    raw_image = raw_image.convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0)
    image = image.to(device)
    qu = "Please describe this image in detail."

    template = INSTRUCTION_TEMPLATE[args.model]
    qu = template.replace("<question>", qu)
    assistant_answer_records[str(img)] = {}

    # Assistant 1: vanilla beam search decoding.
    with torch.inference_mode():
        with torch.no_grad():
            out = model.generate(
                {"image": norm(image), "prompt": qu},
                use_nucleus_sampling=False,
                num_beams=5,
                max_new_tokens=512,
            )
    model_response_1 = out[0]
    assistant_answer_records[str(img)]["assistant_1"] = model_response_1
    print("Beam-5 output:")
    print(model_response_1)

    # Assistant 2: OPERA decoding with the configured hyperparameters.
    with torch.inference_mode():
        with torch.no_grad():
            out = model.generate(
                {"image": norm(image), "prompt": qu},
                use_nucleus_sampling=False,
                num_beams=5,
                max_new_tokens=512,
                output_attentions=True,
                opera_decoding=True,
                scale_factor=args.scale_factor,
                threshold=args.threshold,
                num_attn_candidates=args.num_attn_candidates,
                penalty_weights=args.penalty_weights,
            )
    model_response_2 = out[0]
    assistant_answer_records[str(img)]["assistant_2"] = model_response_2
    print("OPERA output:")
    print(model_response_2)

    prompt = GPT_JUDGE_PROMPT.format(model_response_1, model_response_2)

    gpt_answer = get_gpt4v_answer(prompt, image_path)
    print(gpt_answer)
    gpt_answer_records[str(img)] = gpt_answer
    print(gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" "))
    print(len(gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" ")))

    # Parse the two space-separated scores per criterion; skip images whose
    # judge output does not follow the requested format.
    try:
        hal_score_1, hal_score_2 = gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" ")
        det_score_1, det_score_2 = gpt_answer.split("Detailedness: ")[-1].split("\n")[0].split(" ")
    except Exception:
        continue
    avg_hal_score_1 += int(hal_score_1)
    avg_hal_score_2 += int(hal_score_2)
    avg_det_score_1 += int(det_score_1)
    avg_det_score_2 += int(det_score_2)
    num_count += 1
    print("=========================================")

with open(os.path.join(base_path + f"/{args.model}", 'answers.json'), "w") as f:
    json.dump(assistant_answer_records, f)

with open(os.path.join(base_path + f"/{args.model}", 'records.json'), "w") as f:
    json.dump(gpt_answer_records, f)

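# Report mean scores over the images whose judge output parsed successfully.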
avg_hal_score_1 = float(avg_hal_score_1) / num_count
avg_hal_score_2 = float(avg_hal_score_2) / num_count
avg_det_score_1 = float(avg_det_score_1) / num_count
avg_det_score_2 = float(avg_det_score_2) / num_count
print(f"The avg hal score for Assistant 1 and Assistant 2: {avg_hal_score_1}; {avg_hal_score_2}")
print(f"The avg det score for Assistant 1 and Assistant 2: {avg_det_score_1}; {avg_det_score_2}")