import os

import pandas as pd
from mmengine.dist import master_only
from mmengine.logging import print_log
from PIL import Image

from xtuner.registry import BUILDER

from .base_eval_dataset import BaseEvalDataset
from .utils import YOrN_Extraction, custom_data_process, load_jsonl


def eval_func(pred_list, label_list):
    """Compute accuracy/precision/recall/F1 for one POPE split.

    Predictions and labels are binary: 1 for "yes", 0 for "no". Returns the
    F1 score.
    """
    pos = 1
    neg = 0
    yes_ratio = pred_list.count(1) / len(pred_list)

    # Tally the confusion matrix.
    TP, TN, FP, FN = 0, 0, 0, 0
    for pred, label in zip(pred_list, label_list):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1

    print_log('TP\tFP\tTN\tFN\t', 'current')
    print_log(f'{TP}\t{FP}\t{TN}\t{FN}', 'current')

    # Note: assumes the split yields at least one predicted and one actual
    # "yes"; otherwise precision/recall would divide by zero.
    precision = float(TP) / float(TP + FP)
    recall = float(TP) / float(TP + FN)
    f1 = 2 * precision * recall / (precision + recall)
    acc = (TP + TN) / (TP + TN + FP + FN)
    print_log(f'Accuracy: {acc}', 'current')
    print_log(f'Precision: {precision}', 'current')
    print_log(f'Recall: {recall}', 'current')
    print_log(f'F1 score: {f1}', 'current')
    print_log(f'Yes ratio: {yes_ratio}', 'current')
    return f1
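
# Sanity check for eval_func on toy inputs (hypothetical numbers, not from a
# real POPE run): preds [1, 0, 1] against labels [1, 1, 1] give TP=2, FN=1,
# FP=0, TN=0, so precision = 1.0, recall = 2/3, and
# F1 = 2 * 1.0 * (2/3) / (1.0 + 2/3) = 0.8; eval_func([1, 0, 1], [1, 1, 1])
# therefore returns 0.8 (up to floating point).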


class POPEDataset(BaseEvalDataset):
    METAINFO: dict = dict(name='pope')

    def __init__(self,
                 data_file,
                 coco_val_path,
                 image_processor,
                 pad_image_to_square=True,
                 metainfo=None):
        super().__init__(metainfo)
        # Accept a single annotation file or a list of files; each file
        # becomes one evaluation split.
        if isinstance(data_file, str):
            data_file = [data_file]
        self.raw_data = [load_jsonl(f) for f in data_file]
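        # Illustrative shape of one JSON line (hypothetical values; the keys
        # match what load_data_list reads below):
        #   {"question_id": 1, "image": "COCO_val2014_000000310196.jpg",
        #    "text": "Is there a snowboard in the image?", "label": "no"}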

        # Split names are derived from the annotation file names.
        self.name = [
            os.path.splitext(os.path.basename(f))[0] for f in data_file
        ]

        self.coco_val_path = coco_val_path
        self.image_processor = BUILDER.build(image_processor)
        self.pad_image_to_square = pad_image_to_square

        self.results_xlsx_path = 'pope-results.xlsx'
        self.data = self.load_data_list()

    def get_image(self, image):
        image = Image.open(os.path.join(self.coco_val_path, image))
        return image

    def __len__(self):
        return len(self.data)

    def load_data_list(self):
        # Flatten all splits into a single list, assigning each sample a
        # globally unique img_id.
        data_list = []
        idx = 0
        for data_idx in range(len(self.raw_data)):
            for sample_idx in range(len(self.raw_data[data_idx])):
                sample = self.raw_data[data_idx][sample_idx]
                index = sample['question_id']
                image_path = sample['image']
                question = sample['text']
                answer = sample['label']
                category = self.name[data_idx]
                assert answer in ['yes', 'no']
                data = {
                    'img_id': idx,
                    'index': index,
                    'img': image_path,
                    'question': question,
                    'answer': answer,
                    'category': category
                }
                data_list.append(data)
                idx += 1
        return data_list

    def __getitem__(self, idx):
        data = self.data[idx]
        data_dict = custom_data_process(self, data)
        return data_dict

    @master_only
    def evaluate(self, result, work_dir, show=True):
        # Map each prediction back to its source sample via its img_id.
        orig_index = [x['img_id'] for x in self.data]
        results = []
        for pred_dict in result:
            index = pred_dict['img_id']
            new_index = orig_index.index(index)
            filtered_rows = self.data[new_index]
            cur_result = {}
            cur_result['question'] = filtered_rows.get('question')
            cur_result['prediction'] = pred_dict['prediction']
            cur_result['category'] = filtered_rows['category']
            cur_result['index'] = filtered_rows.get('index')
            cur_result['answer'] = filtered_rows.get('answer')
            results.append(cur_result)

        # Persist the per-question predictions for later inspection.
        results_df = pd.DataFrame(results)
        with pd.ExcelWriter(
                os.path.join(work_dir, self.results_xlsx_path),
                engine='openpyxl') as writer:
            results_df.to_excel(writer, index=False)

        # Score each split separately, then report the average F1.
        score = 0
        for sub_name in self.name:
            sub_results = [x for x in results if x['category'] == sub_name]
            pred_list = [
                int(YOrN_Extraction(x['prediction']) == 'Yes')
                for x in sub_results
            ]
            label_list = [
                int(YOrN_Extraction(x['answer']) == 'Yes') for x in sub_results
            ]
            print_log('============================================',
                      'current')
            print_log(
                'Category: {}, # samples: {}'.format(sub_name,
                                                     len(sub_results)),
                'current')
            cur_f1 = eval_func(pred_list, label_list)
            score += cur_f1

        score /= len(self.name)
        print_log('============================================', 'current')
        print_log(f'Average F1-score: {score}', 'current')
        print_log('============================================', 'current')
        print_log('POPE successfully finished evaluating', 'current')
        return score
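

# Minimal usage sketch (hypothetical paths and a hypothetical image-processor
# config; in an xtuner setup the dataset is normally built from a config):
#
#   from transformers import CLIPImageProcessor
#
#   dataset = POPEDataset(
#       data_file=['coco_pope_popular.jsonl', 'coco_pope_adversarial.jsonl'],
#       coco_val_path='data/coco/val2014',
#       image_processor=dict(
#           type=CLIPImageProcessor.from_pretrained,
#           pretrained_model_name_or_path='openai/clip-vit-large-patch14-336'))
#
# `evaluate(result, work_dir)` then expects `result` to be a list of dicts
# carrying 'img_id' and 'prediction' keys for every sample in the dataset.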