import logging import numpy as np import os from collections import OrderedDict from detectron2.config import global_cfg as cfg import torch from fvcore.common.file_io import PathManager from detectron2.structures.boxes import pairwise_iou from detectron2.utils.comm import all_gather, is_main_process, synchronize import pickle from .evaluator import DatasetEvaluator import json from detectron2.structures import Boxes import html import ftfy import regex as re PATTN = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) def basic_clean(text): text = ftfy.fix_text(text) text = html.unescape(html.unescape(text)) return text.strip() def whitespace_clean(text): text = re.sub(r'\s+', ' ', text) text = text.strip() return text class FLICKR30KEvaluator(DatasetEvaluator): """ Evaluate semantic segmentation """ def __init__(self, dataset_name, distributed=True, output_dir=None): """ Args: dataset_name (str): name of the dataset to be evaluated. distributed (True): if True, will collect results from all ranks for evaluation. Otherwise, will evaluate the results in the current process. num_classes (int): number of classes ignore_label (int): value in semantic segmentation ground truth. Predictions for the corresponding pixels should be ignored. output_dir (str): an output directory to dump results. """ self._dataset_name = dataset_name self._distributed = distributed self._output_dir = output_dir self._cpu_device = torch.device("cpu") self._logger = logging.getLogger(__name__) self.gt_boxes = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/bounding_boxes_test.json")) self.gt_sents = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/sentences_test.json")) def reset(self): self._predictions = {} def process(self, inputs, outputs): """ Args: inputs: the inputs to a model. It is a list of dicts. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a model. It is either list of semantic segmentation predictions (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic segmentation prediction in the same format. """ assert len(inputs) == 1 # batch = 1 during inference dataset_name, img_id, (img_height, img_width), all_str2id_links = inputs[0][-1] img_id = img_id.split('/')[-1] match_scores, processed_results = outputs match_scores = match_scores.to(self._cpu_device) pred_boxes = processed_results[0]['instances'].proposal_boxes.to(self._cpu_device) self._predictions.update({img_id: [img_height, img_width, all_str2id_links, match_scores, pred_boxes]}) def merge_gt_boxes(self, box_anno): gt_boxes = [] phrase_ids = [] scene_box_ids = box_anno['scene'] for k, v in box_anno['boxes'].items(): if k in scene_box_ids: # important: remove scene boxes, otherwise the number of each phrase type cannot match paper continue phrase_ids.append(k) if len(v) == 1: gt_boxes.append(v[0]) else: # when a phrase respond to multiple regions, we take the union of them as paper given v = np.array(v) box = [v[:, 0].min(), v[:, 1].min(), v[:, 2].max(), v[:, 3].max()] gt_boxes.append(box) gt_boxes = np.array(gt_boxes) return phrase_ids, gt_boxes def find_ground_box(self, match_scores, all_str2id_links, sentences, gt_phrase_ids): """ Given matching matrix between region feats and token feats, find the box that grounds a phrase """ num_box = match_scores.size(0) num_cap = int(match_scores.size(1) / 77) all_phrase_score = [] all_phrase_ids = [] for i in range(num_cap): # per sentence this_score = match_scores[:, i*77:(i+1)*77] # [#boxes, 77] input_ids = [iitem for item in all_str2id_links[i] for iitem in item[1]] input_tokens = [item[0] for item in all_str2id_links[i]] phrases = sentences[i]['phrases'] for j, phrase in enumerate(phrases): # per phrase if phrase['phrase_id'] not in gt_phrase_ids: # no gt box for this phrase, skip continue # locate the word words = whitespace_clean(basic_clean(phrase['phrase'])).lower() # phrase['phrase'].lower().replace("-"," ").split() words = re.findall(PATTN, words) first_word_index = None # phrase['first_word_index'] for idx in range(len(input_tokens) - len(words) + 1): # search start word of this phrase if input_tokens[idx : idx + len(words)] == words: # NOTE: key step for alignment btw model prediction and annotation first_word_index = idx break if first_word_index is None: print("Fail to find phrase [{}] in input tokens [{}]".format(words, input_tokens)) start_wd_ind = first_word_index end_wd_ind = first_word_index + len(words) if len(words) != len(phrase['phrase'].split()): pass # print('tokens: {} <--> phrase: {}'.format(words, phrase['phrase'])) # locate the token start_tk_ind = 0 for k_i, k in enumerate(range(0, start_wd_ind)): start_tk_ind += len(all_str2id_links[i][k][1]) token_cnt = 0 for k_i, k in enumerate(range(start_wd_ind, end_wd_ind)): if all_str2id_links[i][k][0] != words[k_i]: print("Word not matched: {} in model output but {} in annotation".format(all_str2id_links[i][k][0], words[k_i])) else: token_cnt += len(all_str2id_links[i][k][1]) # ith sentence, kth word, and its tokens end_tk_ind = start_tk_ind + token_cnt # sanity check phrase_ids1 = [iitem for item in all_str2id_links[i][start_wd_ind:end_wd_ind] for iitem in item[1]] # way 1: use word index to accumulate token ids in a phrase phrase_ids2 = input_ids[start_tk_ind:end_tk_ind] # way 2: use token index to directly index token ids in a phrase if phrase_ids1 != phrase_ids2: print("Santity check: {} from word {} in token".format(phrase_ids1, phrase_ids2)) # index similarity score phrase_score = this_score[:, start_tk_ind:end_tk_ind] phrase_score = phrase_score.mean(dim=1) # phrase_score.max(dim=1)[0] # all_phrase_score.append(phrase_score) all_phrase_ids.append(phrase['phrase_id']) phrase_score_tensor = torch.cat(all_phrase_score) phrase_score_tensor = phrase_score_tensor.view(len(all_phrase_ids), num_box) # NOTE: this should be [#phrases, #object proposals] return phrase_score_tensor, all_phrase_ids def evaluate(self): """ Evaluates Referring Segmentation IoU: """ if self._distributed: synchronize() self._predictions = all_gather(self._predictions) if not is_main_process(): return all_prediction = {} for p in self._predictions: all_prediction.update(p) else: all_prediction = self._predictions if len(all_prediction) < 30: # resume inference results save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(1000) all_prediction = np.load(save_path, allow_pickle=True).tolist() self._logger.info('Resume from {}'.format(save_path)) else: # new run save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(len(all_prediction)) np.save(save_path, all_prediction) self._logger.info('Save results to {}'.format(save_path)) self._logger.info('Got {} images!'.format(len(all_prediction))) image_unique_ids = list(all_prediction.keys()) image_evaled = [] total_num = 0 recall_num = 0 num_type = {} recall_type = {} acc_type = {} recall_topk_num = {5:0, 10:0} point_recall_num = 0 EVAL_THRESH = 0.5 type_cnts = {} for img_sent_id in image_unique_ids: if img_sent_id not in self.gt_boxes: continue else: image_evaled.append(img_sent_id) # results from model result = all_prediction[img_sent_id] phrase_ids = None phrase_types = [] # phrase type: each phrase belongs to a coarse object concept pred_boxes = None # an object proposal selected by model for each phrase img_height, img_width, all_str2id_links = result[0], result[1], result[2] # all_str2id_links: each word and its tokenized ids match_scores = result[3] # matching score [#object proposals, #tokens] precomp_boxes = result[4] # object proposals from offline module # annotation from dataset sentences = self.gt_sents[img_sent_id] box_anno = self.gt_boxes[img_sent_id] # sanity check and box merging assert box_anno['height'] == img_height, box_anno['width'] == img_width gt_phrase_ids, gt_boxes = self.merge_gt_boxes(box_anno) # merged if multiple boxes for the same phrase if len(gt_phrase_ids) == 0: # no gt box for this image continue for sent_item in sentences: for phrase_item in sent_item['phrases']: if phrase_item['phrase_id'] in gt_phrase_ids: phrase_types.append(phrase_item['phrase_type']) # merge similarity scores from token level to phrase level, and find the box that grounds the phrase phrase_score_tensor, all_phrase_ids = self.find_ground_box(match_scores, all_str2id_links, sentences, gt_phrase_ids) pred_boxes_ind = torch.argmax(phrase_score_tensor, dim=1) pred_boxes = precomp_boxes[pred_boxes_ind] pred_similarity = phrase_score_tensor # .t() # pred_similarity: matching score [#phrases, #object proposals] # get single target/gt box for each phrase # 1. any gt box that can be matched as target # refer to (https://github.com/BigRedT/info-ground/blob/22ae6d6ec8b38df473e73034fc895ebf97d39897/exp/ground/eval_flickr_phrase_loc.py#L90) phrase_boxes = [box_anno['boxes'][p_id] for p_id in all_phrase_ids] targets = [] for pr_b, pd_b in zip(phrase_boxes, pred_boxes): matched = False for single_b in pr_b: this_iou = pairwise_iou(Boxes(torch.from_numpy(np.array([single_b])).float()), Boxes(pd_b.view(1,-1))) if (this_iou >= EVAL_THRESH).sum() > 0: targets.append(single_b) matched = True break if not matched: targets.append(single_b) targets = Boxes(torch.from_numpy(np.array(targets)).float()) # 2. union box as target # target_ind = np.array([gt_phrase_ids.index(p_id) for p_id in all_phrase_ids]) # targets = gt_boxes[target_ind] # ground-truth boxes for each phrase in each sentence # targets = Boxes(torch.from_numpy(targets).float()) assert len(phrase_types) == len(targets) # single predicted box for each phrase ious = pairwise_iou(targets, pred_boxes) # this function will change the target_boxes into cuda mode iou = ious.numpy().diagonal() total_num += iou.shape[0] recall_num += int((iou >= EVAL_THRESH).sum()) # 0.5 # metric of point (can be ignored) pred_boxes_tensor = pred_boxes.tensor pred_center = (pred_boxes_tensor[:, :2] + pred_boxes_tensor[:, 2:]) / 2.0 pred_center = pred_center.repeat(1, 2) ## x_c, y_c, x_c, y_c targets_tensor = targets.tensor fall_tensor = targets_tensor - pred_center fall_tensor = (fall_tensor[:, :2] <= 0).float().sum(1) + (fall_tensor[:, 2:] >= 0).float().sum(1) point_recall_num += (fall_tensor == 4).float().numpy().sum() # detailed accuracy across different phrase types for pid, p_type in enumerate(phrase_types): p_type = p_type[0] num_type[p_type] = num_type.setdefault(p_type, 0) + 1 recall_type[p_type] = recall_type.setdefault(p_type, 0) + (iou[pid] >= EVAL_THRESH) # metric of recall when multiple predicted boxes for each phrase ious_top = pairwise_iou(targets, precomp_boxes).cpu() for k in [5, 10]: top_k = torch.topk(pred_similarity, k=k, dim=1)[0][:, [-1]] pred_similarity_topk = (pred_similarity >= top_k).float() ious_top_k = (ious_top * pred_similarity_topk).numpy() recall_topk_num[k] += int(((ious_top_k >= EVAL_THRESH).sum(1) > 0).sum()) acc = recall_num / total_num acc_top5 = recall_topk_num[5] / total_num acc_top10 = recall_topk_num[10] / total_num point_acc = point_recall_num / total_num # details about each coarse type of phrase for type, type_num in num_type.items(): acc_type[type] = recall_type[type] / type_num # if self._output_dir: # PathManager.mkdirs(self._output_dir) # file_path = os.path.join(self._output_dir, "prediction_{}.pkl".format(str(acc).replace('.', '_')[:6])) # with PathManager.open(file_path, "wb") as f: # pickle.dump(all_prediction, f) del all_prediction self._logger.info('evaluation on {} expression instances, detailed_iou: {}'.format(len(image_evaled), acc_type)) self._logger.info('Evaluate Pointing Accuracy: PointAcc:{}'.format(point_acc)) results = OrderedDict({"acc": acc, "acc_top5": acc_top5, "acc_top10": acc_top10}) self._logger.info(results) self._logger.info(num_type) return results