# regionclip-demo/detectron2/evaluation/flickr30k_evaluation.py
import html
import json
import logging
import os
import pickle
from collections import OrderedDict

import ftfy
import numpy as np
import regex as re
import torch
from fvcore.common.file_io import PathManager

from detectron2.config import global_cfg as cfg
from detectron2.structures import Boxes
from detectron2.structures.boxes import pairwise_iou
from detectron2.utils.comm import all_gather, is_main_process, synchronize

from .evaluator import DatasetEvaluator
PATTN = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
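# PATTN mirrors the pre-tokenization pattern of CLIP's BPE tokenizer.
# Illustrative example of what it yields (assumed input, not from the original code):
#   re.findall(PATTN, "a man's red shirt") -> ['a', 'man', "'s", 'red', 'shirt']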
def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()
def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text
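# Illustrative example of the cleaning pipeline (assumed input, not from the original code):
#   whitespace_clean(basic_clean("A   man &amp; a dog ")) -> "A man & a dog"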
class FLICKR30KEvaluator(DatasetEvaluator):
"""
Evaluate semantic segmentation
"""
def __init__(self, dataset_name, distributed=True, output_dir=None):
"""
Args:
dataset_name (str): name of the dataset to be evaluated.
distributed (True): if True, will collect results from all ranks for evaluation.
Otherwise, will evaluate the results in the current process.
num_classes (int): number of classes
ignore_label (int): value in semantic segmentation ground truth. Predictions for the
corresponding pixels should be ignored.
output_dir (str): an output directory to dump results.
"""
self._dataset_name = dataset_name
self._distributed = distributed
self._output_dir = output_dir
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
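# NOTE: the annotation paths below (and the cache path in evaluate()) are hardcoded to the
# original author's machine; point them at your local processed Flickr30K Entities files.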
self.gt_boxes = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/bounding_boxes_test.json"))
self.gt_sents = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/sentences_test.json"))
def reset(self):
self._predictions = {}
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a model.
It is a list of dicts. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a model. It is either list of semantic segmentation predictions
(Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
segmentation prediction in the same format.
"""
assert len(inputs) == 1 # batch = 1 during inference
dataset_name, img_id, (img_height, img_width), all_str2id_links = inputs[0][-1]
img_id = img_id.split('/')[-1]
match_scores, processed_results = outputs
match_scores = match_scores.to(self._cpu_device)
pred_boxes = processed_results[0]['instances'].proposal_boxes.to(self._cpu_device)
self._predictions.update({img_id: [img_height, img_width, all_str2id_links, match_scores, pred_boxes]})
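# Each cached entry, as consumed later in evaluate(), has the form
#   [img_height, img_width, all_str2id_links, match_scores, pred_boxes]
# where match_scores is roughly [#object proposals, #sentences * 77] and all_str2id_links
# maps every word of every caption to its token ids.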
def merge_gt_boxes(self, box_anno):
gt_boxes = []
phrase_ids = []
scene_box_ids = box_anno['scene']
for k, v in box_anno['boxes'].items():
if k in scene_box_ids: # skip scene boxes; otherwise the per-type phrase counts cannot match the paper
continue
phrase_ids.append(k)
if len(v) == 1:
gt_boxes.append(v[0])
else:
# when a phrase corresponds to multiple regions, take the union of those boxes as the ground truth, following the paper (see the illustrative example after this method)
v = np.array(v)
box = [v[:, 0].min(), v[:, 1].min(), v[:, 2].max(), v[:, 3].max()]
gt_boxes.append(box)
gt_boxes = np.array(gt_boxes)
return phrase_ids, gt_boxes
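# Illustrative example of the union-box merge above (coordinates are hypothetical):
#   v = [[10, 20, 50, 60], [30, 10, 80, 70]] -> merged box [10, 10, 80, 70]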
def find_ground_box(self, match_scores, all_str2id_links, sentences, gt_phrase_ids):
""" Given matching matrix between region feats and token feats, find the box that grounds a phrase
"""
num_box = match_scores.size(0)
num_cap = int(match_scores.size(1) / 77) # 77 = CLIP context length; each sentence occupies one 77-token block of columns
all_phrase_score = []
all_phrase_ids = []
for i in range(num_cap): # per sentence
this_score = match_scores[:, i*77:(i+1)*77] # [#boxes, 77]
input_ids = [iitem for item in all_str2id_links[i] for iitem in item[1]]
input_tokens = [item[0] for item in all_str2id_links[i]]
phrases = sentences[i]['phrases']
for j, phrase in enumerate(phrases): # per phrase
if phrase['phrase_id'] not in gt_phrase_ids: # no gt box for this phrase, skip
continue
# locate the word
words = whitespace_clean(basic_clean(phrase['phrase'])).lower() # phrase['phrase'].lower().replace("-"," ").split()
words = re.findall(PATTN, words)
first_word_index = None # phrase['first_word_index']
for idx in range(len(input_tokens) - len(words) + 1): # search start word of this phrase
if input_tokens[idx : idx + len(words)] == words: # NOTE: key step for alignment btw model prediction and annotation
first_word_index = idx
break
if first_word_index is None:
print("Fail to find phrase [{}] in input tokens [{}]".format(words, input_tokens))
start_wd_ind = first_word_index
end_wd_ind = first_word_index + len(words)
if len(words) != len(phrase['phrase'].split()):
pass # print('tokens: {} <--> phrase: {}'.format(words, phrase['phrase']))
# locate the token
start_tk_ind = 0
for k in range(start_wd_ind): # accumulate the token counts of all words before the phrase
start_tk_ind += len(all_str2id_links[i][k][1])
token_cnt = 0
for k_i, k in enumerate(range(start_wd_ind, end_wd_ind)):
if all_str2id_links[i][k][0] != words[k_i]:
print("Word not matched: {} in model output but {} in annotation".format(all_str2id_links[i][k][0], words[k_i]))
else:
token_cnt += len(all_str2id_links[i][k][1]) # ith sentence, kth word, and its tokens
end_tk_ind = start_tk_ind + token_cnt
# sanity check
phrase_ids1 = [iitem for item in all_str2id_links[i][start_wd_ind:end_wd_ind] for iitem in item[1]] # way 1: use word index to accumulate token ids in a phrase
phrase_ids2 = input_ids[start_tk_ind:end_tk_ind] # way 2: use token index to directly index token ids in a phrase
if phrase_ids1 != phrase_ids2:
print("Santity check: {} from word {} in token".format(phrase_ids1, phrase_ids2))
# index similarity score
phrase_score = this_score[:, start_tk_ind:end_tk_ind]
phrase_score = phrase_score.mean(dim=1) # phrase_score.max(dim=1)[0] #
all_phrase_score.append(phrase_score)
all_phrase_ids.append(phrase['phrase_id'])
phrase_score_tensor = torch.cat(all_phrase_score)
phrase_score_tensor = phrase_score_tensor.view(len(all_phrase_ids), num_box) # NOTE: this should be [#phrases, #object proposals]
return phrase_score_tensor, all_phrase_ids
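# Illustrative sketch of the word-to-token alignment above (token ids are hypothetical):
#   all_str2id_links[i] = [("a", [320]), ("red", [736]), ("shirt", [2523, 4])]
#   the phrase "red shirt" starts at word index 1, so start_tk_ind = len([320]) = 1 and
#   end_tk_ind = 1 + len([736]) + len([2523, 4]) = 4, i.e. columns 1..3 of this_score are
#   averaged into a single score per object proposal for that phrase.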
def evaluate(self):
"""
Evaluates Referring Segmentation IoU:
"""
if self._distributed:
synchronize()
self._predictions = all_gather(self._predictions)
if not is_main_process():
return
all_prediction = {}
for p in self._predictions:
all_prediction.update(p)
else:
all_prediction = self._predictions
if len(all_prediction) < 30: # too few images gathered: assume a resume run and load previously cached inference results
save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(1000)
all_prediction = np.load(save_path, allow_pickle=True).tolist()
self._logger.info('Resume from {}'.format(save_path))
else: # full run: cache the gathered inference results to disk
save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(len(all_prediction))
np.save(save_path, all_prediction)
self._logger.info('Save results to {}'.format(save_path))
self._logger.info('Got {} images!'.format(len(all_prediction)))
image_unique_ids = list(all_prediction.keys())
image_evaled = []
total_num = 0
recall_num = 0
num_type = {}
recall_type = {}
acc_type = {}
recall_topk_num = {5:0, 10:0}
point_recall_num = 0
EVAL_THRESH = 0.5
type_cnts = {}
for img_sent_id in image_unique_ids:
if img_sent_id not in self.gt_boxes:
continue
else:
image_evaled.append(img_sent_id)
# results from model
result = all_prediction[img_sent_id]
phrase_ids = None
phrase_types = [] # phrase type: each phrase belongs to a coarse object concept
pred_boxes = None # an object proposal selected by model for each phrase
img_height, img_width, all_str2id_links = result[0], result[1], result[2] # all_str2id_links: each word and its tokenized ids
match_scores = result[3] # matching score [#object proposals, #tokens]
precomp_boxes = result[4] # object proposals from offline module
# annotation from dataset
sentences = self.gt_sents[img_sent_id]
box_anno = self.gt_boxes[img_sent_id]
# sanity check and box merging
assert box_anno['height'] == img_height and box_anno['width'] == img_width
gt_phrase_ids, gt_boxes = self.merge_gt_boxes(box_anno) # merged if multiple boxes for the same phrase
if len(gt_phrase_ids) == 0: # no gt box for this image
continue
for sent_item in sentences:
for phrase_item in sent_item['phrases']:
if phrase_item['phrase_id'] in gt_phrase_ids:
phrase_types.append(phrase_item['phrase_type'])
# merge similarity scores from token level to phrase level, and find the box that grounds the phrase
phrase_score_tensor, all_phrase_ids = self.find_ground_box(match_scores, all_str2id_links, sentences, gt_phrase_ids)
pred_boxes_ind = torch.argmax(phrase_score_tensor, dim=1)
pred_boxes = precomp_boxes[pred_boxes_ind]
pred_similarity = phrase_score_tensor # .t() # pred_similarity: matching score [#phrases, #object proposals]
# get single target/gt box for each phrase
# 1. any gt box that can be matched as target
# refer to (https://github.com/BigRedT/info-ground/blob/22ae6d6ec8b38df473e73034fc895ebf97d39897/exp/ground/eval_flickr_phrase_loc.py#L90)
phrase_boxes = [box_anno['boxes'][p_id] for p_id in all_phrase_ids]
targets = []
for pr_b, pd_b in zip(phrase_boxes, pred_boxes):
matched = False
for single_b in pr_b:
this_iou = pairwise_iou(Boxes(torch.from_numpy(np.array([single_b])).float()), Boxes(pd_b.view(1,-1)))
if (this_iou >= EVAL_THRESH).sum() > 0:
targets.append(single_b)
matched = True
break
if not matched: # no proposal-matched gt box; fall back to the last annotated box for this phrase
targets.append(single_b)
targets = Boxes(torch.from_numpy(np.array(targets)).float())
# 2. union box as target
# target_ind = np.array([gt_phrase_ids.index(p_id) for p_id in all_phrase_ids])
# targets = gt_boxes[target_ind] # ground-truth boxes for each phrase in each sentence
# targets = Boxes(torch.from_numpy(targets).float())
assert len(phrase_types) == len(targets)
# single predicted box for each phrase
ious = pairwise_iou(targets, pred_boxes) # this function will change the target_boxes into cuda mode
iou = ious.numpy().diagonal()
total_num += iou.shape[0]
recall_num += int((iou >= EVAL_THRESH).sum()) # 0.5
# pointing-game metric (optional): a hit if the predicted box center falls inside the target box
pred_boxes_tensor = pred_boxes.tensor
pred_center = (pred_boxes_tensor[:, :2] + pred_boxes_tensor[:, 2:]) / 2.0
pred_center = pred_center.repeat(1, 2) ## x_c, y_c, x_c, y_c
targets_tensor = targets.tensor
fall_tensor = targets_tensor - pred_center
fall_tensor = (fall_tensor[:, :2] <= 0).float().sum(1) + (fall_tensor[:, 2:] >= 0).float().sum(1)
point_recall_num += (fall_tensor == 4).float().numpy().sum()
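# Equivalently (illustrative): with target [x1, y1, x2, y2] and predicted center (cx, cy),
# the four sign checks above count a hit iff x1 <= cx <= x2 and y1 <= cy <= y2.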
# detailed accuracy across different phrase types
for pid, p_type in enumerate(phrase_types):
p_type = p_type[0] # a phrase may carry multiple types; use the first one
num_type[p_type] = num_type.setdefault(p_type, 0) + 1
recall_type[p_type] = recall_type.setdefault(p_type, 0) + (iou[pid] >= EVAL_THRESH)
# recall@k: a hit if any of the top-k scored proposals for a phrase overlaps the target with IoU >= EVAL_THRESH
ious_top = pairwise_iou(targets, precomp_boxes).cpu()
for k in [5, 10]:
top_k = torch.topk(pred_similarity, k=k, dim=1)[0][:, [-1]]
pred_similarity_topk = (pred_similarity >= top_k).float()
ious_top_k = (ious_top * pred_similarity_topk).numpy()
recall_topk_num[k] += int(((ious_top_k >= EVAL_THRESH).sum(1) > 0).sum())
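# Illustrative sketch of the top-k masking above (scores are hypothetical): for one phrase
# with proposal scores [0.9, 0.2, 0.7] and k = 2, the k-th largest score is 0.7, so
# pred_similarity_topk = [1, 0, 1] and the phrase counts as a recall@2 hit if either of
# those two proposals has IoU >= EVAL_THRESH with its target box.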
acc = recall_num / total_num
acc_top5 = recall_topk_num[5] / total_num
acc_top10 = recall_topk_num[10] / total_num
point_acc = point_recall_num / total_num
# details about each coarse type of phrase
for p_type, p_type_num in num_type.items():
acc_type[p_type] = recall_type[p_type] / p_type_num
# if self._output_dir:
# PathManager.mkdirs(self._output_dir)
# file_path = os.path.join(self._output_dir, "prediction_{}.pkl".format(str(acc).replace('.', '_')[:6]))
# with PathManager.open(file_path, "wb") as f:
# pickle.dump(all_prediction, f)
del all_prediction
self._logger.info('Evaluated {} images; per-type grounding accuracy: {}'.format(len(image_evaled), acc_type))
self._logger.info('Evaluate Pointing Accuracy: PointAcc:{}'.format(point_acc))
results = OrderedDict({"acc": acc, "acc_top5": acc_top5, "acc_top10": acc_top10})
self._logger.info(results)
self._logger.info(num_type)
return results