# refTools/evaluation/refEvaluation.py
from refTools.evaluation.tokenizer.ptbtokenizer import PTBTokenizer
from refTools.evaluation.bleu.bleu import Bleu
from refTools.evaluation.meteor.meteor import Meteor
from refTools.evaluation.rouge.rouge import Rouge
from refTools.evaluation.cider.cider import Cider
"""
Input: refer and Res = [{ref_id, sent}]
Things of interest
evalRefs - list of ['ref_id', 'CIDEr', 'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'ROUGE_L', 'METEOR']
eval - dict of {metric: score}
refToEval - dict of {ref_id: ['ref_id', 'CIDEr', 'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'ROUGE_L', 'METEOR']}
"""

class RefEvaluation:
    def __init__(self, refer, Res):
        """
        :param refer: refer class of current dataset
        :param Res: [{'ref_id', 'sent'}]
        """
        self.evalRefs = []
        self.eval = {}
        self.refToEval = {}
        self.refer = refer
        self.Res = Res

    def evaluate(self):
        evalRefIds = [ann['ref_id'] for ann in self.Res]

        refToGts = {}
        for ref_id in evalRefIds:
            ref = self.refer.Refs[ref_id]
            # strip non-ASCII characters from the ground-truth expressions
            gt_sents = [sent['sent'].encode('ascii', 'ignore').decode('ascii') for sent in ref['sentences']]  # up to 3 expressions
            refToGts[ref_id] = gt_sents
        refToRes = {ann['ref_id']: [ann['sent']] for ann in self.Res}

        print('tokenization...')
        tokenizer = PTBTokenizer()
        self.refToRes = tokenizer.tokenize(refToRes)
        self.refToGts = tokenizer.tokenize(refToGts)
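        # PTBTokenizer normalizes candidates and references the same way
        # (tokenizing and lowercasing), so the n-gram metrics below compare
        # like with like.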
        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
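        # Each entry pairs a scorer with its metric name(s): Bleu(4) computes
        # BLEU-1 through BLEU-4 in one pass, hence the list of four names.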
        # =================================================
        # Compute scores
        # =================================================
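        # Each compute_score call returns a corpus-level score plus per-ref
        # scores (for Bleu, a list of each, one entry per n-gram order).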
        for scorer, method in scorers:
            print('computing %s score...' % scorer.method())
            score, scores = scorer.compute_score(self.refToGts, self.refToRes)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setRefToEvalRefs(scs, self.refToGts.keys(), m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setRefToEvalRefs(scores, self.refToGts.keys(), method)
                print("%s: %0.3f" % (method, score))
        self.setEvalRefs()

    def setEval(self, score, method):
        self.eval[method] = score

    def setRefToEvalRefs(self, scores, refIds, method):
        for refId, score in zip(refIds, scores):
            if refId not in self.refToEval:
                self.refToEval[refId] = {}
                self.refToEval[refId]["ref_id"] = refId
            self.refToEval[refId][method] = score

    def setEvalRefs(self):
        # collect the per-ref score dicts; avoids shadowing the built-in eval()
        self.evalRefs = list(self.refToEval.values())

if __name__ == '__main__':

    import os.path as osp
    import sys
    ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..'))
    sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'datasets'))
    from refer import REFER

    # load refer of dataset
    dataset = 'refcoco'
    refer = REFER(dataset, splitBy='google')

    # mimic some Res
    val_refIds = refer.getRefIds(split='test')
    ref_id = 49767
    print("GD: %s" % refer.Refs[ref_id]['sentences'])
    Res = [{'ref_id': ref_id, 'sent': 'left bottle'}]

    # evaluate some refer expressions
    refEval = RefEvaluation(refer, Res)
    refEval.evaluate()

    # print output evaluation scores
    for metric, score in refEval.eval.items():
        print('%s: %.3f' % (metric, score))
    # demo how to use evalRefs to retrieve a low-score result
    # evals = [eva for eva in refEval.evalRefs if eva['CIDEr'] < 30]
    # print('ground truth sents')
    # refId = evals[0]['ref_id']
    # print('refId: %s' % refId)
    # print([sent['sent'] for sent in refer.Refs[refId]['sentences']])
    #
    # print('generated sent (CIDEr score %0.1f)' % evals[0]['CIDEr'])
    # print(refEval.refToEval[8])
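    # Expected output shapes after evaluate() (scores are placeholders):
    #   refEval.eval      -> {'Bleu_1': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}
    #   refEval.refToEval -> {49767: {'ref_id': 49767, 'Bleu_1': ..., 'CIDEr': ...}, ...}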