# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Counter
from math import exp, log
from random import randint, seed

import datasets
import evaluate
from numpy import mean, std


_CITATION = """\
@InProceedings{napoles-EtAl:2015:ACL-IJCNLP,
  author    = {Napoles, Courtney  and  Sakaguchi, Keisuke  and  Post, Matt  and  Tetreault, Joel},
  title     = {Ground Truth for Grammatical Error Correction Metrics},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = {July},
  year      = {2015},
  address   = {Beijing, China},
  publisher = {Association for Computational Linguistics},
  pages     = {588--593},
  url       = {http://www.aclweb.org/anthology/P15-2097}
}
"""

_DESCRIPTION = """\
GLEU can be used for any monolingual "translation" task, that is, for Grammatical Error Correction
and other text-rewriting tasks. Like BLEU, GLEU computes n-gram precisions over the reference,
but it assigns more weight to n-grams that have been correctly changed from the source.
GLEU therefore rewards corrections while also correctly crediting unchanged source text.
"""

_KWARGS_DESCRIPTION = """
Computes GLEU scores for a list of predictions given their references.
Args:
    sources: Source sentences. Assumed to be identical to the references if not provided.
    references: Reference for each prediction. Each reference should be a string with tokens separated by spaces.
    predictions: List of predictions to score. Each prediction should be a string with tokens separated by spaces.
Returns:
    mean_gleu_score: Average GLEU score over all predictions.
    SD: Standard deviation of the per-iteration GLEU scores, rounded to 2 decimal places.
Examples:
    >>> my_new_module = evaluate.load("my_new_module")
    >>> references = ["We may in actual fact be communicating with a hoax Facebook acccount of a cyberfriend , which we assume to be real but in reality , it is a fake account ."]
    >>> results = my_new_module.compute(references=references, predictions=["We may of actual fact communicating with a hoax Facebook acccount of a cyber friend , which we assumed to be real but in reality , it is a fake account ."])
    >>> print(results)
    {'mean_gleu_score': 0.6, 'SD': 0.0}
    >>> results = my_new_module.compute(references=references, predictions=["We may be in actual fact communicating with a hoax Facebook acccount of a cyber friend , we assume to be real but in reality , it is a fake account ."])
    >>> print(results)
    {'mean_gleu_score': 0.62, 'SD': 0.0}
    >>> results = my_new_module.compute(references=references, predictions=["We may in actual fact communicating with a hoax Facebook account of a cyber friend , which we assume to be real but in reality , it is a fake accounts ."])
    >>> print(results)
    {'mean_gleu_score': 0.64, 'SD': 0.0}
"""


class GLEU:
    """Corpus-level GLEU statistics, adapted from https://github.com/cnap/gec-ranking/."""

    def __init__(self, order=4):
        # maximum n-gram order used for the precision terms
        self.order = order

    def load_hypothesis_sentence(self, hypothesis):
        self.hlen = len(hypothesis)
        self.this_h_ngrams = [self.get_ngram_counts(hypothesis, n) for n in range(1, self.order + 1)]

    def load_sources(self, source_sents):
        self.all_s_ngrams = [
            [self.get_ngram_counts(source_sent.split(), n) for n in range(1, self.order + 1)]
            for source_sent in source_sents
        ]

    def load_references(self, ref_sents):
        self.refs = [[] for i in range(len(self.all_s_ngrams))]
        self.rlens = [[] for i in range(len(self.all_s_ngrams))]
        for i, ref_sent in enumerate(ref_sents):
            self.refs[i].append(ref_sent.split())
            self.rlens[i].append(len(ref_sent.split()))

        # count the number of references each n-gram appears in
        self.all_rngrams_freq = [Counter() for i in range(self.order)]

        self.all_r_ngrams = []
        for refset in self.refs:
            all_ngrams = []
            self.all_r_ngrams.append(all_ngrams)

            for n in range(1, self.order + 1):
                ngrams = self.get_ngram_counts(refset[0], n)
                all_ngrams.append(ngrams)

                for k in ngrams.keys():
                    self.all_rngrams_freq[n - 1][k] += 1

                for ref in refset[1:]:
                    new_ngrams = self.get_ngram_counts(ref, n)
                    for nn in new_ngrams.elements():
                        if new_ngrams[nn] > ngrams.get(nn, 0):
                            ngrams[nn] = new_ngrams[nn]

    def get_ngram_counts(self, sentence, n):
        return Counter([tuple(sentence[i : i + n]) for i in range(len(sentence) + 1 - n)])

    # returns n-grams in a but not in b
    def get_ngram_diff(self, a, b):
        diff = Counter(a)
        for k in set(a) & set(b):
            del diff[k]
        return diff

    def normalization(self, ngram, n):
        return 1.0 * self.all_rngrams_freq[n - 1][ngram] / len(self.rlens[0])

    # Collect BLEU-relevant statistics for a single hypothesis/reference pair.
    # Return value is a generator yielding:
    # (c, r, numerator1, denominator1, ...
    #  numerator4, denominator4)
    # Summing the columns across calls to this function on an entire corpus
    # will produce a vector of statistics that can be used to compute GLEU.
    def gleu_stats(self, i, r_ind=None):
        hlen = self.hlen
        rlen = self.rlens[i][r_ind]

        yield hlen
        yield rlen

        for n in range(1, self.order + 1):
            h_ngrams = self.this_h_ngrams[n - 1]
            s_ngrams = self.all_s_ngrams[i][n - 1]
            r_ngrams = self.get_ngram_counts(self.refs[i][r_ind], n)

            s_ngram_diff = self.get_ngram_diff(s_ngrams, r_ngrams)

            yield max([sum((h_ngrams & r_ngrams).values()) - sum((h_ngrams & s_ngram_diff).values()), 0])

            yield max([hlen + 1 - n, 0])

    # Compute GLEU from collected statistics obtained by call(s) to gleu_stats
    def compute_gleu(self, stats, smooth=False):
        # smooth 0 counts for sentence-level scores
        if smooth:
            stats = [s if s != 0 else 1 for s in stats]
        if len(list(filter(lambda x: x == 0, stats))) > 0:
            return 0
        (c, r) = stats[:2]
        log_gleu_prec = sum([log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])]) / self.order
        return exp(min([0, 1 - float(r) / c]) + log_gleu_prec)


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class gleu(evaluate.Metric):
    """GLEU metric for Grammatical Error Correction and other monolingual rewriting tasks."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/cnap/gec-ranking/"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores."""
        pass

    def _compute(self, references, predictions, sources=None):
        """Returns the scores."""
        num_iterations = 500
        order = 4
        if len(references) == 1:
            num_iterations = 1

        gleu_calculator = GLEU(order=order)
        # Sources must be loaded before the references; they default to the
        # references themselves when no separate source sentences are given.
        if sources is not None:
            gleu_calculator.load_sources(sources)
        else:
            gleu_calculator.load_sources(references)
        gleu_calculator.load_references(references)

        # first generate a random list of reference indices, using a different
        # seed for each iteration (each sentence has a single reference in this
        # implementation, so the sampled index is always 0)
        indices = []
        for j in range(num_iterations):
            seed(j * 101)
            indices.append([randint(0, len(gleu_calculator.refs[i]) - 1) for i in range(len(predictions))])

        iter_stats = [[0 for i in range(2 * order + 2)] for j in range(num_iterations)]

        for i, h in enumerate(predictions):
            # hypotheses are whitespace-tokenized, just like the references and sources
            gleu_calculator.load_hypothesis_sentence(h.split())

            # store the stats of this sentence for each reference
            # so we don't have to recalculate them 500 times
            stats_by_ref = [None for r in range(len(gleu_calculator.refs[i]))]

            for j in range(num_iterations):
                ref = indices[j][i]
                this_stats = stats_by_ref[ref]

                if this_stats is None:
                    this_stats = [s for s in gleu_calculator.gleu_stats(i, r_ind=ref)]
                    stats_by_ref[ref] = this_stats

                iter_stats[j] = [sum(scores) for scores in zip(iter_stats[j], this_stats)]

        sent_scores = [gleu_calculator.compute_gleu(stats) for stats in iter_stats]
        mean_score = float(mean(sent_scores))
        std_score = float(round(std(sent_scores), 2))
        return {"mean_gleu_score": mean_score, "SD": std_score}
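

# Minimal usage sketch (not part of the metric itself): it exercises the GLEU helper class
# directly on one sentence pair reused from the docstring examples, so it does not depend on
# where this script is stored. When the script is saved or published, the usual entry point
# would instead be something like `evaluate.load("path/to/this_script.py")`; that path is an
# assumption, not something defined in this file.
if __name__ == "__main__":
    reference = (
        "We may in actual fact be communicating with a hoax Facebook acccount of a cyberfriend , "
        "which we assume to be real but in reality , it is a fake account ."
    )
    hypothesis = (
        "We may of actual fact communicating with a hoax Facebook acccount of a cyber friend , "
        "which we assumed to be real but in reality , it is a fake account ."
    )

    calculator = GLEU(order=4)
    calculator.load_sources([reference])      # sources default to the references
    calculator.load_references([reference])
    calculator.load_hypothesis_sentence(hypothesis.split())

    # Collect the (c, r, numerator/denominator) statistics and turn them into a score.
    stats = [s for s in calculator.gleu_stats(0, r_ind=0)]
    print({"gleu_score": calculator.compute_gleu(stats)})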