# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import json


class Metric(object):
    def __init__(self, config, metric_names):
        self.metric_names = metric_names

    def best_metric(self, metric):
        return metric[self.metric_names[0]]

    def save_metrics(self, fn, metrics):
        with open(fn, "w") as fw:
            json.dump(metrics, fw)

    def print_computed_metrics(self, metrics):
        raise NotImplementedError


class RetrievalMetric(Metric):
    """
    this is modified from `howto100m/metrics.py`.
    History of changes:
    refactor as a class.
    add metric_key in __init__.
    """

    def __init__(self, config, metric_names=["R1", "R5", "R10", "MR"]):
        super().__init__(config, metric_names)
        self.error = False  # TODO(huxu): add to config to print error.

    def compute_metrics(self, outputs, texts, **kwargs):
        x = outputs
        # rank (0-indexed) of the diagonal (ground-truth) score in each row,
        # with scores sorted by descending similarity.
        sx = np.sort(-x, axis=1)
        d = np.diag(-x)
        d = d[:, np.newaxis]
        ind = sx - d
        ind = np.where(ind == 0)
        ind = ind[1]
        metrics = {}
        metrics["R1"] = float(np.sum(ind == 0)) / len(ind)
        metrics["R5"] = float(np.sum(ind < 5)) / len(ind)
        metrics["R10"] = float(np.sum(ind < 10)) / len(ind)
        metrics["MR"] = np.median(ind) + 1

        max_idx = np.argmax(outputs, axis=1)
        if self.error:
            # print top-20 errors.
            error = []
            for ex_idx in range(20):
                error.append((texts[ex_idx], texts[max_idx[ex_idx]]))
            metrics["error"] = error
        return metrics

    def print_computed_metrics(self, metrics):
        r1 = metrics["R1"]
        r5 = metrics["R5"]
        r10 = metrics["R10"]
        mr = metrics["MR"]
        print(
            "R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}".format(
                r1, r5, r10, mr
            )
        )
        if "error" in metrics:
            print(metrics["error"])
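

# Illustrative usage sketch (not part of the original module): RetrievalMetric
# consumes a square text-to-video similarity matrix whose diagonal holds the
# matching pairs. The matrix size and captions below are made up.
def _example_retrieval_metric_usage():
    sim = np.random.randn(100, 100)  # sim[i, j]: score of text i vs. video j
    texts = ["caption {}".format(i) for i in range(100)]
    metric = RetrievalMetric(config=None)
    computed = metric.compute_metrics(sim, texts)
    metric.print_computed_metrics(computed)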


class DiDeMoMetric(Metric):
    """
    History of changes:
    python 2.x to python 3.x.
    merge utils.py into eval to save one file.
    reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py
    Code to evaluate your results on the DiDeMo dataset.
    """
    def __init__(self, config, metric_names=["rank1", "rank5", "miou"]):
        super().__init__(config, metric_names)

    def compute_metrics(self, outputs, targets, **kwargs):
        assert len(outputs) == len(targets)
        rank1, rank5, miou = self._eval_predictions(outputs, targets)
        metrics = {
            "rank1": rank1,
            "rank5": rank5,
            "miou": miou
        }
        return metrics

    def print_computed_metrics(self, metrics):
        rank1 = metrics["rank1"]
        rank5 = metrics["rank5"]
        miou = metrics["miou"]
        print(
            "Average rank@1: {:.4f} Average rank@5: {:.4f} Average iou: {:.4f}".format(
                rank1, rank5, miou
            )
        )

    def _iou(self, pred, gt):
        intersection = max(0, min(pred[1], gt[1]) + 1 - max(pred[0], gt[0]))
        union = max(pred[1], gt[1]) + 1 - min(pred[0], gt[0])
        return float(intersection) / union

    def _rank(self, pred, gt):
        return pred.index(tuple(gt)) + 1

    def _eval_predictions(self, segments, data):
        '''
        Inputs:
            segments: For each item in the ground truth data, rank possible
                video segments given the description and video. In DiDeMo,
                there are 21 possible moments extracted for each video, so the
                list of video segments will be of length 21. The first video
                segment should be the one that best corresponds to the text
                query. There are 4180 sentences in the validation data, so when
                evaluating a model on the val dataset, segments should be a
                list of length 4180, and each item in segments should be a
                list of length 21.
            data: ground truth data
        '''
        average_ranks = []
        average_iou = []
        for s, d in zip(segments, data):
            pred = s[0]
            ious = [self._iou(pred, t) for t in d['times']]
            average_iou.append(np.mean(np.sort(ious)[-3:]))
            # `if tuple(t) in s` guards against (start, end) pairs that are
            # missing from the prediction list.
            ranks = [self._rank(s, t) for t in d['times'] if tuple(t) in s]
            average_ranks.append(np.mean(np.sort(ranks)[:3]))
        rank1 = np.sum(np.array(average_ranks) <= 1) / float(len(average_ranks))
        rank5 = np.sum(np.array(average_ranks) <= 5) / float(len(average_ranks))
        miou = np.mean(average_iou)
        return rank1, rank5, miou


class NLGMetric(Metric):
    def __init__(
        self,
        config,
        metric_names=[
            "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4",
            "METEOR", "ROUGE_L", "CIDEr"
        ]
    ):
        super().__init__(config, metric_names)
        # please install NLGEval from `https://github.com/Maluuba/nlg-eval`.
        from nlgeval import NLGEval
        self.nlg = NLGEval()

    def compute_metrics(self, outputs, targets, **kwargs):
        return self.nlg.compute_metrics(
            hyp_list=outputs, ref_list=targets)

    def print_computed_metrics(self, metrics):
        Bleu_1 = metrics["Bleu_1"]
        Bleu_2 = metrics["Bleu_2"]
        Bleu_3 = metrics["Bleu_3"]
        Bleu_4 = metrics["Bleu_4"]
        METEOR = metrics["METEOR"]
        ROUGE_L = metrics["ROUGE_L"]
        CIDEr = metrics["CIDEr"]
        print(
            "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f} - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}".format(
                Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr
            )
        )


class QAMetric(Metric):
    def __init__(
        self,
        config,
        metric_names=["acc"]
    ):
        super().__init__(config, metric_names)

    def compute_metrics(self, outputs, targets, **kwargs):
        from sklearn.metrics import accuracy_score
        return {"acc": accuracy_score(targets, outputs)}

    def print_computed_metrics(self, metrics):
        print("acc: {:.4f}".format(metrics["acc"]))


class COINActionSegmentationMetric(Metric):
    """
    The COIN dataset lists 3 repos for Action Segmentation:
    Action Sets, NeuralNetwork-Viterbi, and TCFPN-ISBA.
    The first and second use the same evaluation:
    https://github.com/alexanderrichard/action-sets/blob/master/eval.py
    Future reference for the third:
    `https://github.com/Zephyr-D/TCFPN-ISBA/blob/master/utils/metrics.py`
    """
    def __init__(self, config, metric_names=["frame_acc"]):
        super().__init__(config, metric_names)

    def compute_metrics(self, outputs, targets):
        n_errors = sum(outputs != targets)
        n_frames = len(targets)
        return {"frame_acc": 1.0 - float(n_errors) / n_frames}

    def print_computed_metrics(self, metrics):
        fa = metrics["frame_acc"]
        print("frame accuracy:", fa)
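

# Illustrative sketch (not part of the original module) of the inputs that
# DiDeMoMetric above expects, following the docstring of `_eval_predictions`:
# one ranked list of 21 candidate (start, end) moments per query, plus the
# annotators' ground-truth times. All values below are made up.
def _example_didemo_metric_usage():
    # 21 candidate moments over 6 segments, ranked best-first by the model.
    segments = [[(s, e) for s in range(6) for e in range(s, 6)]]
    # each ground-truth entry lists the moments chosen by the annotators.
    data = [{"times": [[0, 0], [0, 1], [0, 0], [1, 1]]}]
    metric = DiDeMoMetric(config=None)
    metric.print_computed_metrics(metric.compute_metrics(segments, data))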


class CrossTaskMetric(Metric):
    def __init__(self, config, metric_names=["recall"]):
        super().__init__(config, metric_names)

    def compute_metrics(self, outputs, targets, **kwargs):
        """refactored from line 166:
        https://github.com/DmZhukov/CrossTask/blob/master/train.py"""
        recalls = self._get_recalls(Y_true=targets, Y_pred=outputs)
        results = {}
        for task, rec in recalls.items():
            results[str(task)] = rec

        avg_recall = np.mean(list(recalls.values()))
        results["recall"] = avg_recall
        return results

    def print_computed_metrics(self, metrics):
        print('Recall: {0:0.3f}'.format(metrics["recall"]))
        for task in metrics:
            if task != "recall":
                print('Task {0}. Recall = {1:0.3f}'.format(
                    task, metrics[task]))

    def _get_recalls(self, Y_true, Y_pred):
        """refactored from
        https://github.com/DmZhukov/CrossTask/blob/master/train.py"""
        step_match = {task: 0 for task in Y_true.keys()}
        step_total = {task: 0 for task in Y_true.keys()}
        for task, ys_true in Y_true.items():
            ys_pred = Y_pred[task]
            for vid in set(ys_pred.keys()).intersection(set(ys_true.keys())):
                y_true = ys_true[vid]
                y_pred = ys_pred[vid]
                step_total[task] += (y_true.sum(axis=0) > 0).sum()
                step_match[task] += (y_true * y_pred).sum()
        recalls = {
            task: step_match[task] / n for task, n in step_total.items()}
        return recalls


class ActionRecognitionMetric(Metric):
    def __init__(
        self,
        config,
        metric_names=["acc", "acc_splits", "r1_splits", "r5_splits", "r10_splits"]
    ):
        super().__init__(config, metric_names)

    def compute_metrics(self, outputs, targets, splits, **kwargs):
        all_video_embd = outputs
        labels = targets
        split1, split2, split3 = splits
        accs = []
        r1s = []
        r5s = []
        r10s = []
        for split in range(3):
            if split == 0:
                s = split1
            elif split == 1:
                s = split2
            else:
                s = split3

            X_pred = all_video_embd[np.where(s == 2)[0]]
            label_test = labels[np.where(s == 2)[0]]
            logits = X_pred
            X_pred = np.argmax(X_pred, axis=1)
            acc = np.sum(X_pred == label_test) / float(len(X_pred))
            accs.append(acc)
            # compute recall.
            sorted_pred = (-logits).argsort(axis=-1)
            label_test_sp = label_test.reshape(-1, 1)

            r1 = np.mean((sorted_pred[:, :1] == label_test_sp).sum(axis=1), axis=0)
            r5 = np.mean((sorted_pred[:, :5] == label_test_sp).sum(axis=1), axis=0)
            r10 = np.mean((sorted_pred[:, :10] == label_test_sp).sum(axis=1), axis=0)
            r1s.append(r1)
            r5s.append(r5)
            r10s.append(r10)

        return {"acc": accs[0], "acc_splits": accs,
                "r1_splits": r1s, "r5_splits": r5s, "r10_splits": r10s}

    def print_computed_metrics(self, metrics):
        for split, acc in enumerate(metrics["acc_splits"]):
            print("Top 1 accuracy on split {}: {}; r1 {}; r5 {}; r10 {}".format(
                split + 1, acc,
                metrics["r1_splits"][split],
                metrics["r5_splits"][split],
                metrics["r10_splits"][split],
                )
            )
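

# Illustrative sketch (not part of the original module) of the nested inputs
# CrossTaskMetric expects: {task: {video_id: frame-by-step assignment matrix}}
# for both predictions and ground truth. The task/video ids and the tiny
# matrices below are made up.
def _example_crosstask_metric_usage():
    n_frames, n_steps = 8, 3
    y_true = np.zeros((n_frames, n_steps), dtype=int)
    y_true[1, 0] = y_true[4, 2] = 1      # two annotated steps
    y_pred = np.zeros((n_frames, n_steps), dtype=int)
    y_pred[1, 0] = 1                     # the model recovers one of them
    Y_true = {"task_0": {"vid_0": y_true}}
    Y_pred = {"task_0": {"vid_0": y_pred}}
    metric = CrossTaskMetric(config=None)
    metric.print_computed_metrics(metric.compute_metrics(Y_pred, Y_true))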
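

# Illustrative sketch (not part of the original module) of the inputs
# ActionRecognitionMetric expects: per-video class scores, integer labels, and
# three split vectors in which (per the code above) the value 2 marks test
# examples. Sizes and values below are made up.
def _example_action_recognition_metric_usage():
    num_videos, num_classes = 32, 10
    scores = np.random.randn(num_videos, num_classes)
    labels = np.random.randint(0, num_classes, size=num_videos)
    splits = [np.full(num_videos, 2) for _ in range(3)]  # everything is "test"
    metric = ActionRecognitionMetric(config=None)
    metric.print_computed_metrics(
        metric.compute_metrics(scores, labels, splits))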