import evaluate
import datasets

from collections import Counter
from math import log, exp
from random import seed, randint
from numpy import mean


_CITATION = """\
@InProceedings{napoles-EtAl:2015:ACL-IJCNLP,
  author    = {Napoles, Courtney and Sakaguchi, Keisuke and Post, Matt and Tetreault, Joel},
  title     = {Ground Truth for Grammatical Error Correction Metrics},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = {July},
  year      = {2015},
  address   = {Beijing, China},
  publisher = {Association for Computational Linguistics},
  pages     = {588--593},
  url       = {http://www.aclweb.org/anthology/P15-2097}
}
"""


_DESCRIPTION = """\
GLEU can be used for any monolingual "translation" task, that is, for Grammatical Error Correction and other text re-writing tasks. Like BLEU, GLEU computes n-gram precisions over the reference, but it assigns extra weight to n-grams that have been correctly changed from the source. GLEU therefore rewards corrections while also correctly crediting unchanged source text.
"""


_KWARGS_DESCRIPTION = """
Computes the GLEU score of predictions against references.
Args:
    predictions: list of predictions to score. Each prediction should be a string with tokens separated by spaces.
    references: reference for each prediction. Each reference should be a string with tokens separated by spaces.
        The references are also used as the source sentences, since this module takes no separate sources.
Returns:
    mean_gleu_score: average GLEU score over all predictions.

Examples:

>>> gleu = evaluate.load("gleu")
>>> references=["We may in actual fact be communicating with a hoax Facebook acccount of a cyberfriend , which we assume to be real but in reality , it is a fake account ."]
>>> results = gleu.compute(references=references, predictions=["We may of actual fact communicating with a hoax Facebook acccount of a cyber friend , which we assumed to be real but in reality , it is a fake account ."])
>>> print(results)
{'mean_gleu_score': 0.6}

>>> results = gleu.compute(references=references, predictions=["We may be in actual fact communicating with a hoax Facebook acccount of a cyber friend , we assume to be real but in reality , it is a fake account ."])
>>> print(results)
{'mean_gleu_score': 0.62}

>>> results = gleu.compute(references=references, predictions=["We may in actual fact communicating with a hoax Facebook account of a cyber friend , which we assume to be real but in reality , it is a fake accounts ."])
>>> print(results)
{'mean_gleu_score': 0.64}
"""


class GLEU:
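    """Sentence-level GLEU statistics, following the reference implementation at https://github.com/cnap/gec-ranking/ (Napoles et al., 2015)."""
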
    def __init__(self, order=4):
        self.order = order

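    # Cache the length and the 1..order n-gram counts of the current tokenized hypothesis.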
    def load_hypothesis_sentence(self, hypothesis):
        self.hlen = len(hypothesis)
        self.this_h_ngrams = [self.get_ngram_counts(hypothesis, n) for n in range(1, self.order + 1)]

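    # Pre-compute n-gram counts for every source sentence (tokenized by whitespace).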
    def load_sources(self, source_sents):
        self.all_s_ngrams = [
            [self.get_ngram_counts(source_sent.split(), n) for n in range(1, self.order + 1)]
            for source_sent in source_sents
        ]

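    # Store the tokenized references and their lengths per sentence, the document-level
    # reference n-gram frequencies, and (when several references exist for a sentence)
    # the maximum count observed for each n-gram across those references.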
    def load_references(self, ref_sents):
        self.refs = [[] for i in range(len(self.all_s_ngrams))]
        self.rlens = [[] for i in range(len(self.all_s_ngrams))]
        for i, ref_sent in enumerate(ref_sents):
            self.refs[i].append(ref_sent.split())
            self.rlens[i].append(len(ref_sent.split()))

        self.all_rngrams_freq = [Counter() for i in range(self.order)]

        self.all_r_ngrams = []
        for refset in self.refs:
            all_ngrams = []
            self.all_r_ngrams.append(all_ngrams)

            for n in range(1, self.order + 1):
                ngrams = self.get_ngram_counts(refset[0], n)
                all_ngrams.append(ngrams)

                for k in ngrams.keys():
                    self.all_rngrams_freq[n - 1][k] += 1

                for ref in refset[1:]:
                    new_ngrams = self.get_ngram_counts(ref, n)
                    for nn in new_ngrams.elements():
                        if new_ngrams[nn] > ngrams.get(nn, 0):
                            ngrams[nn] = new_ngrams[nn]

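    # Count the n-grams of length n in a tokenized sentence, as a Counter of token tuples.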
    def get_ngram_counts(self, sentence, n):
        return Counter([tuple(sentence[i:i + n]) for i in range(len(sentence) + 1 - n)])

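    # Return the n-grams that appear in a but not in b, e.g. source n-grams missing from
    # the reference; matching these in the hypothesis is what GLEU penalizes.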
    def get_ngram_diff(self, a, b):
        diff = Counter(a)
        for k in (set(a) & set(b)):
            del diff[k]
        return diff

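    # Frequency of a reference n-gram, normalized by the number of reference sets.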
    def normalization(self, ngram, n):
        return 1.0 * self.all_rngrams_freq[n - 1][ngram] / len(self.rlens[0])

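    # Yield the sufficient statistics for hypothesis i against reference r_ind:
    # hypothesis length, reference length, then for each n in 1..order the clipped
    # match count (reference matches minus matches of uncorrected source-only n-grams,
    # floored at 0) followed by the total number of hypothesis n-grams of that length.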
    def gleu_stats(self, i, r_ind=None):
        hlen = self.hlen
        rlen = self.rlens[i][r_ind]

        yield hlen
        yield rlen

        for n in range(1, self.order + 1):
            h_ngrams = self.this_h_ngrams[n - 1]
            s_ngrams = self.all_s_ngrams[i][n - 1]
            r_ngrams = self.get_ngram_counts(self.refs[i][r_ind], n)

            s_ngram_diff = self.get_ngram_diff(s_ngrams, r_ngrams)

            yield max([sum((h_ngrams & r_ngrams).values()) - sum((h_ngrams & s_ngram_diff).values()), 0])

            yield max([hlen + 1 - n, 0])

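    # Combine the statistics into a score, BLEU-style:
    #   GLEU = BP * exp((1 / order) * sum_n log p_n), where
    #   p_n = max(0, matches_n - penalties_n) / hyp_ngrams_n  and
    #   BP  = exp(min(0, 1 - rlen / hlen)) is the brevity penalty.
    # Any zero statistic makes the score 0 unless smooth=True.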
    def compute_gleu(self, stats, smooth=False):
        if smooth:
            stats = [s if s != 0 else 1 for s in stats]
        if len(list(filter(lambda x: x == 0, stats))) > 0:
            return 0
        (c, r) = stats[:2]
        log_gleu_prec = sum([log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])]) / self.order
        return exp(min([0, 1 - float(r) / c]) + log_gleu_prec)


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class gleu(evaluate.Metric):
    """GLEU metric for grammatical error correction and other monolingual text-rewriting tasks."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/cnap/gec-ranking/"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        pass

    def _compute(self, references, predictions):
        """Returns the scores"""
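        # GLEU is estimated by repeatedly sampling one reference per sentence and
        # averaging the corpus-level scores over the iterations; with a single
        # reference per sentence, one iteration is enough.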
        num_iterations = 500
        order = 4

        if len(references) == 1:
            num_iterations = 1

        gleu_calculator = GLEU(order=order)

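        # This module receives no separate source sentences, so the references also
        # serve as the sources; the source-penalty term of GLEU then contributes nothing.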
        gleu_calculator.load_sources(references)
        gleu_calculator.load_references(references)

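        # For every iteration, sample (with a fixed seed for reproducibility) one
        # reference index per sentence from that sentence's available references.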
        indices = []
        for j in range(num_iterations):
            seed(j * 101)
            indices.append([randint(0, len(gleu_calculator.refs[i]) - 1) for i in range(len(predictions))])

        iter_stats = [[0 for i in range(2 * order + 2)] for j in range(num_iterations)]

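        # Accumulate corpus-level statistics for each iteration over all hypotheses.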
        for i, h in enumerate(predictions):
            gleu_calculator.load_hypothesis_sentence(h.split())

            stats_by_ref = [None for r in range(len(references))]

            for j in range(num_iterations):
                ref = indices[j][i]
                this_stats = stats_by_ref[ref]

                if this_stats is None:
                    this_stats = [s for s in gleu_calculator.gleu_stats(i, r_ind=ref)]
                    stats_by_ref[ref] = this_stats

                iter_stats[j] = [sum(scores) for scores in zip(iter_stats[j], this_stats)]

        # One corpus-level GLEU score per iteration; report their mean.
        sent_scores = [gleu_calculator.compute_gleu(stats) for stats in iter_stats]
        mean_score = mean(sent_scores)
        return {"mean_gleu_score": mean_score}