gleu / gleu.py
venkatasg's picture
updated emojis
3e76455
raw
history blame
10.3 kB
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import evaluate
import datasets
from collections import Counter
from math import log, exp
from random import seed, randint
from numpy import mean, std, round
# BibTeX entry passed as `citation=` to evaluate.MetricInfo in gleu._info.
_CITATION = """\
@InProceedings{napoles-EtAl:2015:ACL-IJCNLP,
author = {Napoles, Courtney and Sakaguchi, Keisuke and Post, Matt and Tetreault, Joel},
title = {Ground Truth for Grammatical Error Correction Metrics},
booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
month = {July},
year = {2015},
address = {Beijing, China},
publisher = {Association for Computational Linguistics},
pages = {588--593},
url = {http://www.aclweb.org/anthology/P15-2097}
}
"""
# Human-readable summary of the metric; passed as `description=` to
# evaluate.MetricInfo and prepended to the metric class docstring.
_DESCRIPTION = """\
GLEU metric can be used for any monolingual "translation" task, that is it can be used for Grammatical Error Correction and other text re-writing tasks. GLEU computes n-gram precisions over the reference but assigns more weight to n-grams that have been correctly changed from the source. GLEU rewards corrections while also correctly crediting unchanged source text.
"""
# Argument/return/usage documentation; passed as `inputs_description=` to
# evaluate.MetricInfo and prepended to the metric class docstring.
# NOTE(review): the text below lists a `sources` argument and an `SD` return
# value, but `gleu._compute` only takes `references` and `predictions` and
# only returns `mean_gleu_score` -- confirm which side should change.
# NOTE(review): "my_new_module" looks like a template placeholder, and the
# example scores have not been re-checked against the code -- TODO confirm.
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
sources: Source language reference sentences. This is assumed to be same as references if not provided.
references: Reference for each prediction. Each reference should be a string with tokens separated by spaces.
predictions: list of predictions to score. Each prediction should be a string with tokens separated by spaces.
Returns:
mean_gleu_score: Average gleu_score over all predictions.
SD: standard deviation
Examples:
>>> my_new_module = evaluate.load("my_new_module")
>>> references=["We may in actual fact be communicating with a hoax Facebook acccount of a cyberfriend , which we assume to be real but in reality , it is a fake account ."]
>>> results = my_new_module.compute(references=references, predictions=["We may of actual fact communicating with a hoax Facebook acccount of a cyber friend , which we assumed to be real but in reality , it is a fake account ."])
>>> print(results)
{'mean_gleu_score': 0.6}
>>> results = my_new_module.compute(references=references, predictions=["We may be in actual fact communicating with a hoax Facebook acccount of a cyber friend , we assume to be real but in reality , it is a fake account ."])
>>> print(results)
{'mean_gleu_score': 0.62}
>>> results = my_new_module.compute(references=references, predictions=["We may in actual fact communicating with a hoax Facebook account of a cyber friend , which we assume to be real but in reality , it is a fake accounts ."])
>>> print(results)
{'mean_gleu_score': 0.64}
"""
class GLEU():
    """Collects the n-gram statistics needed to compute GLEU scores.

    Typical call order: ``load_sources`` -> ``load_references`` ->
    ``load_hypothesis_sentence`` -> ``gleu_stats`` -> ``compute_gleu``.
    """

    def __init__(self, order=4):
        """Create a calculator that uses n-grams of length 1..``order``."""
        self.order = order

    def load_hypothesis_sentence(self, hypothesis):
        """Store the hypothesis scored by subsequent ``gleu_stats`` calls.

        Args:
            hypothesis: a whitespace-delimited string or an already
                tokenized sequence of tokens.
        """
        # Sources and references are split on whitespace when loaded; a raw
        # string must be split the same way, otherwise it would be sliced
        # into character n-grams that never line up with token n-grams.
        if isinstance(hypothesis, str):
            hypothesis = hypothesis.split()
        self.hlen = len(hypothesis)
        self.this_h_ngrams = [
            self.get_ngram_counts(hypothesis, n)
            for n in range(1, self.order + 1)
        ]

    def load_sources(self, source_sents):
        """Tokenize each source sentence and store its n-gram counts.

        Args:
            source_sents: iterable of whitespace-delimited strings, one per
                sentence.
        """
        self.all_s_ngrams = [
            [
                self.get_ngram_counts(source_sent.split(), n)
                for n in range(1, self.order + 1)
            ]
            for source_sent in source_sents
        ]

    def load_references(self, ref_sents):
        """Tokenize the references and build per-sentence n-gram tables.

        Must be called after ``load_sources``: one reference slot is
        allocated per loaded source sentence, and ``ref_sents[i]`` is
        attached to sentence ``i``.

        Args:
            ref_sents: iterable of whitespace-delimited strings.

        Raises:
            IndexError: if the number of references does not match the
                number of loaded source sentences.
        """
        self.refs = [[] for _ in self.all_s_ngrams]
        self.rlens = [[] for _ in self.all_s_ngrams]
        for i, ref_sent in enumerate(ref_sents):
            tokens = ref_sent.split()
            self.refs[i].append(tokens)
            self.rlens[i].append(len(tokens))

        # For each order: how many sentences have the n-gram in their first
        # reference.
        self.all_rngrams_freq = [Counter() for _ in range(self.order)]

        self.all_r_ngrams = []
        for refset in self.refs:
            all_ngrams = []
            self.all_r_ngrams.append(all_ngrams)
            for n in range(1, self.order + 1):
                ngrams = self.get_ngram_counts(refset[0], n)
                all_ngrams.append(ngrams)
                for k in ngrams:
                    self.all_rngrams_freq[n - 1][k] += 1
                # Merge any further references, keeping the maximum count
                # seen for each n-gram.
                for ref in refset[1:]:
                    new_ngrams = self.get_ngram_counts(ref, n)
                    for ngram, count in new_ngrams.items():
                        if count > ngrams.get(ngram, 0):
                            ngrams[ngram] = count

    def get_ngram_counts(self, sentence, n):
        """Return a Counter of the length-``n`` n-grams (tuples) in ``sentence``."""
        return Counter(
            tuple(sentence[i:i + n]) for i in range(len(sentence) + 1 - n)
        )

    def get_ngram_diff(self, a, b):
        """Return the entries of ``a`` whose keys do not occur in ``b``."""
        diff = Counter(a)
        for k in (set(a) & set(b)):
            del diff[k]
        return diff

    def normalization(self, ngram, n):
        """Return the stored frequency of ``ngram`` divided by the number of
        references held for the first sentence."""
        return 1.0 * self.all_rngrams_freq[n - 1][ngram] / len(self.rlens[0])

    def gleu_stats(self, i, r_ind=None):
        """Yield the GLEU statistics for the loaded hypothesis vs. sentence ``i``.

        Yields, in order: hypothesis length, reference length, then for each
        n-gram order a (numerator, denominator) pair. Summing these columns
        over a corpus gives the vector expected by ``compute_gleu``.

        Args:
            i: index of the source/reference sentence.
            r_ind: index of the reference to use for sentence ``i``;
                ``None`` selects the first reference.
        """
        if r_ind is None:
            # A None index cannot be used to subscript the reference lists.
            r_ind = 0
        hlen = self.hlen
        rlen = self.rlens[i][r_ind]
        reference = self.refs[i][r_ind]

        yield hlen
        yield rlen

        for n in range(1, self.order + 1):
            h_ngrams = self.this_h_ngrams[n - 1]
            s_ngrams = self.all_s_ngrams[i][n - 1]
            r_ngrams = self.get_ngram_counts(reference, n)
            # Source n-grams absent from the reference: matching them in the
            # hypothesis is subtracted from the reference matches.
            s_ngram_diff = self.get_ngram_diff(s_ngrams, r_ngrams)

            matched = sum((h_ngrams & r_ngrams).values())
            penalized = sum((h_ngrams & s_ngram_diff).values())
            yield max(matched - penalized, 0)
            yield max(hlen + 1 - n, 0)

    def compute_gleu(self, stats, smooth=False):
        """Compute GLEU from statistics produced by ``gleu_stats``.

        Args:
            stats: list ``[c, r, num_1, den_1, ..., num_k, den_k]``.
            smooth: when true, replace every zero entry by 1 before scoring
                (intended for sentence-level scores).

        Returns:
            0 if any (unsmoothed) statistic is zero, otherwise the
            brevity-penalized geometric mean of the n-gram precisions.
        """
        if smooth:
            stats = [s if s != 0 else 1 for s in stats]
        if any(s == 0 for s in stats):
            return 0
        c, r = stats[:2]
        precisions = list(zip(stats[2::2], stats[3::2]))
        if precisions:
            # Average over the number of orders actually present instead of
            # a hard-coded 4, so calculators with order != 4 score correctly.
            log_gleu_prec = (
                sum(log(float(x) / y) for x, y in precisions) / len(precisions)
            )
        else:
            log_gleu_prec = 0.0
        return exp(min(0, 1 - float(r) / c) + log_gleu_prec)
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class gleu(evaluate.Metric):
    """GLEU metric module: scores predictions against one reference each."""

    def _info(self):
        """Return the ``evaluate.MetricInfo`` describing this module."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/cnap/gec-ranking/"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # This module downloads nothing.
        pass

    def _compute(self, references, predictions):
        """Return the corpus-level GLEU score of ``predictions``.

        Args:
            references: list of reference strings, tokens separated by
                whitespace; ``references[i]`` belongs to ``predictions[i]``.
            predictions: list of prediction strings, tokens separated by
                whitespace.

        Returns:
            dict with the single key ``"mean_gleu_score"``.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                "predictions and references must have the same length, got "
                f"{len(predictions)} and {len(references)}"
            )

        order = 4
        gleu_calculator = GLEU(order=order)
        # The declared features carry no separate source sentences, so the
        # references double as the sources.
        gleu_calculator.load_sources(references)
        gleu_calculator.load_references(references)

        # Each sentence stores exactly one reference, so reference index 0 is
        # the only valid choice. Sampling an index from the number of
        # sentences (as before) indexes past that single reference, and
        # resampling over one reference cannot change the result, so one
        # deterministic pass over the corpus is sufficient.
        corpus_stats = [0] * (2 * order + 2)
        for i, hypothesis in enumerate(predictions):
            # Tokenize like the references; an unsplit string would be
            # compared character by character against token n-grams.
            gleu_calculator.load_hypothesis_sentence(hypothesis.split())
            sentence_stats = gleu_calculator.gleu_stats(i, r_ind=0)
            corpus_stats = [
                total + stat for total, stat in zip(corpus_stats, sentence_stats)
            ]

        score = gleu_calculator.compute_gleu(corpus_stats)
        return {"mean_gleu_score": float(score)}