import evaluate
import datasets

from collections import Counter
from math import log, exp
from random import seed, randint
from numpy import mean


_CITATION = """\
@InProceedings{napoles-EtAl:2015:ACL-IJCNLP,
  author    = {Napoles, Courtney and Sakaguchi, Keisuke and Post, Matt and Tetreault, Joel},
  title     = {Ground Truth for Grammatical Error Correction Metrics},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = {July},
  year      = {2015},
  address   = {Beijing, China},
  publisher = {Association for Computational Linguistics},
  pages     = {588--593},
  url       = {http://www.aclweb.org/anthology/P15-2097}
}
"""


_DESCRIPTION = """\
GLEU can be used for any monolingual "translation" task, that is, for Grammatical Error Correction and other text re-writing tasks. Like BLEU, GLEU computes n-gram precisions over the reference, but it assigns extra weight to n-grams that have been correctly changed from the source. GLEU therefore rewards corrections while also correctly crediting unchanged source text.
"""


_KWARGS_DESCRIPTION = """
Computes the GLEU score of predictions against references.
Args:
    predictions: list of predictions to score. Each prediction should be a string with tokens separated by spaces.
    references: reference for each prediction. Each reference should be a string with tokens separated by spaces.
        The references are also used as the source sentences, since this module takes no separate sources.
Returns:
    mean_gleu_score: average GLEU score over all predictions.

Examples:

>>> gleu = evaluate.load("gleu")
>>> references=["We may in actual fact be communicating with a hoax Facebook acccount of a cyberfriend , which we assume to be real but in reality , it is a fake account ."]
>>> results = gleu.compute(references=references, predictions=["We may of actual fact communicating with a hoax Facebook acccount of a cyber friend , which we assumed to be real but in reality , it is a fake account ."])
>>> print(results)
{'mean_gleu_score': 0.6}

>>> results = gleu.compute(references=references, predictions=["We may be in actual fact communicating with a hoax Facebook acccount of a cyber friend , we assume to be real but in reality , it is a fake account ."])
>>> print(results)
{'mean_gleu_score': 0.62}

>>> results = gleu.compute(references=references, predictions=["We may in actual fact communicating with a hoax Facebook account of a cyber friend , which we assume to be real but in reality , it is a fake accounts ."])
>>> print(results)
{'mean_gleu_score': 0.64}
"""


class GLEU:
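    """Sentence-level GLEU statistics, following the reference implementation at https://github.com/cnap/gec-ranking/ (Napoles et al., 2015)."""
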
    def __init__(self, order=4):
        self.order = order

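    # Cache the length and the 1..order n-gram counts of the current tokenized hypothesis.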
    def load_hypothesis_sentence(self, hypothesis):
        self.hlen = len(hypothesis)
        self.this_h_ngrams = [self.get_ngram_counts(hypothesis, n) for n in range(1, self.order + 1)]

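    # Pre-compute n-gram counts for every source sentence (tokenized by whitespace).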
    def load_sources(self, source_sents):
        self.all_s_ngrams = [
            [self.get_ngram_counts(source_sent.split(), n) for n in range(1, self.order + 1)]
            for source_sent in source_sents
        ]

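    # Store the tokenized references and their lengths per sentence, the document-level
    # reference n-gram frequencies, and (when several references exist for a sentence)
    # the maximum count observed for each n-gram across those references.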
    def load_references(self, ref_sents):
        self.refs = [[] for i in range(len(self.all_s_ngrams))]
        self.rlens = [[] for i in range(len(self.all_s_ngrams))]
        for i, ref_sent in enumerate(ref_sents):
            self.refs[i].append(ref_sent.split())
            self.rlens[i].append(len(ref_sent.split()))

        self.all_rngrams_freq = [Counter() for i in range(self.order)]

        self.all_r_ngrams = []
        for refset in self.refs:
            all_ngrams = []
            self.all_r_ngrams.append(all_ngrams)

            for n in range(1, self.order + 1):
                ngrams = self.get_ngram_counts(refset[0], n)
                all_ngrams.append(ngrams)

                for k in ngrams.keys():
                    self.all_rngrams_freq[n - 1][k] += 1

                for ref in refset[1:]:
                    new_ngrams = self.get_ngram_counts(ref, n)
                    for nn in new_ngrams.elements():
                        if new_ngrams[nn] > ngrams.get(nn, 0):
                            ngrams[nn] = new_ngrams[nn]

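    # Count the n-grams of length n in a tokenized sentence, as a Counter of token tuples.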
    def get_ngram_counts(self, sentence, n):
        return Counter([tuple(sentence[i:i + n]) for i in range(len(sentence) + 1 - n)])

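    # Return the n-grams that appear in a but not in b, e.g. source n-grams missing from
    # the reference; matching these in the hypothesis is what GLEU penalizes.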
    def get_ngram_diff(self, a, b):
        diff = Counter(a)
        for k in (set(a) & set(b)):
            del diff[k]
        return diff

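    # Frequency of a reference n-gram, normalized by the number of reference sets.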
    def normalization(self, ngram, n):
        return 1.0 * self.all_rngrams_freq[n - 1][ngram] / len(self.rlens[0])

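    # Yield the sufficient statistics for hypothesis i against reference r_ind:
    # hypothesis length, reference length, then for each n in 1..order the clipped
    # match count (reference matches minus matches of uncorrected source-only n-grams,
    # floored at 0) followed by the total number of hypothesis n-grams of that length.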
    def gleu_stats(self, i, r_ind=None):
        hlen = self.hlen
        rlen = self.rlens[i][r_ind]

        yield hlen
        yield rlen

        for n in range(1, self.order + 1):
            h_ngrams = self.this_h_ngrams[n - 1]
            s_ngrams = self.all_s_ngrams[i][n - 1]
            r_ngrams = self.get_ngram_counts(self.refs[i][r_ind], n)

            s_ngram_diff = self.get_ngram_diff(s_ngrams, r_ngrams)

            yield max([sum((h_ngrams & r_ngrams).values()) - sum((h_ngrams & s_ngram_diff).values()), 0])

            yield max([hlen + 1 - n, 0])

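    # Combine the statistics into a score, BLEU-style:
    #   GLEU = BP * exp((1 / order) * sum_n log p_n), where
    #   p_n = max(0, matches_n - penalties_n) / hyp_ngrams_n  and
    #   BP  = exp(min(0, 1 - rlen / hlen)) is the brevity penalty.
    # Any zero statistic makes the score 0 unless smooth=True.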
    def compute_gleu(self, stats, smooth=False):
        if smooth:
            stats = [s if s != 0 else 1 for s in stats]
        if len(list(filter(lambda x: x == 0, stats))) > 0:
            return 0
        (c, r) = stats[:2]
        log_gleu_prec = sum([log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])]) / self.order
        return exp(min([0, 1 - float(r) / c]) + log_gleu_prec)


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class gleu(evaluate.Metric):
    """GLEU metric for grammatical error correction and other monolingual text-rewriting tasks."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/cnap/gec-ranking/"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        pass

    def _compute(self, references, predictions):
        """Returns the scores"""
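        # GLEU is estimated by repeatedly sampling one reference per sentence and
        # averaging the corpus-level scores over the iterations; with a single
        # reference per sentence, one iteration is enough.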
        num_iterations = 500
        order = 4

        if len(references) == 1:
            num_iterations = 1

        gleu_calculator = GLEU(order=order)

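        # This module receives no separate source sentences, so the references also
        # serve as the sources; the source-penalty term of GLEU then contributes nothing.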
        gleu_calculator.load_sources(references)
        gleu_calculator.load_references(references)

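        # For every iteration, sample (with a fixed seed for reproducibility) one
        # reference index per sentence from that sentence's available references.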
        indices = []
        for j in range(num_iterations):
            seed(j * 101)
            indices.append([randint(0, len(gleu_calculator.refs[i]) - 1) for i in range(len(predictions))])

        iter_stats = [[0 for i in range(2 * order + 2)] for j in range(num_iterations)]

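        # Accumulate corpus-level statistics for each iteration over all hypotheses.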
        for i, h in enumerate(predictions):
            gleu_calculator.load_hypothesis_sentence(h.split())

            stats_by_ref = [None for r in range(len(references))]

            for j in range(num_iterations):
                ref = indices[j][i]
                this_stats = stats_by_ref[ref]

                if this_stats is None:
                    this_stats = [s for s in gleu_calculator.gleu_stats(i, r_ind=ref)]
                    stats_by_ref[ref] = this_stats

                iter_stats[j] = [sum(scores) for scores in zip(iter_stats[j], this_stats)]

        # One corpus-level GLEU score per iteration; report their mean.
        sent_scores = [gleu_calculator.compute_gleu(stats) for stats in iter_stats]
        mean_score = mean(sent_scores)
        return {"mean_gleu_score": mean_score}