Spaces:
Build error
Build error
""" BLEU metric. """ | |
import datasets | |
_CITATION = """\ | |
@INPROCEEDINGS{Papineni02bleu:a, | |
author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu}, | |
title = {BLEU: a Method for Automatic Evaluation of Machine Translation}, | |
booktitle = {}, | |
year = {2002}, | |
pages = {311--318} | |
} | |
@inproceedings{lin-och-2004-orange, | |
title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation", | |
author = "Lin, Chin-Yew and | |
Och, Franz Josef", | |
booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics", | |
month = "aug 23{--}aug 27", | |
year = "2004", | |
address = "Geneva, Switzerland", | |
publisher = "COLING", | |
url = "https://www.aclweb.org/anthology/C04-1072", | |
pages = "501--507", | |
} | |
""" | |
_DESCRIPTION = """\ | |
BLEU (bilingual evaluation understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another. | |
Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, | |
the better it is" – this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and | |
remains one of the most popular automated and inexpensive metrics. | |
Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations. | |
Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality. Intelligibility or grammatical correctness | |
are not taken into account[citation needed]. | |
BLEU's output is always a number between 0 and 1. This value indicates how similar the candidate text is to the reference texts, with values closer to 1 | |
representing more similar texts. Few human translations will attain a score of 1, since this would indicate that the candidate is identical to one of the | |
reference translations. For this reason, it is not necessary to attain a score of 1. Because there are more opportunities to match, adding additional | |
reference translations will increase the BLEU score. | |
""" | |
_KWARGS_DESCRIPTION = """ | |
Computes BLEU score of translated segments against one or more references. | |
Args: | |
predictions: list of translations to score. | |
Each translation should be tokenized into a list of tokens. | |
references: list of lists of references for each translation. | |
Each reference should be tokenized into a list of tokens. | |
max_order: Maximum n-gram order to use when computing BLEU score. | |
smooth: Whether or not to apply Lin et al. 2004 smoothing. | |
Returns: | |
'bleu': bleu score, | |
'precisions': geometric mean of n-gram precisions, | |
'brevity_penalty': brevity penalty, | |
'length_ratio': ratio of lengths, | |
'translation_length': translation_length, | |
'reference_length': reference_length | |
Examples: | |
>>> predictions = [ | |
... ["hello", "there", "general", "kenobi"], # tokenized prediction of the first sample | |
... ["foo", "bar", "foobar"] # tokenized prediction of the second sample | |
... ] | |
>>> references = [ | |
... [["hello", "there", "general", "kenobi"], ["hello", "there", "!"]], # tokenized references for the first sample (2 references) | |
... [["foo", "bar", "foobar"]] # tokenized references for the second sample (1 reference) | |
... ] | |
>>> bleu = datasets.load_metric("bleu") | |
>>> results = bleu.compute(predictions=predictions, references=references) | |
>>> print(results["bleu"]) | |
1.0 | |
""" | |
class Bleu(datasets.Metric): | |
def _info(self): | |
return datasets.MetricInfo( | |
description=_DESCRIPTION, | |
citation=_CITATION, | |
inputs_description=_KWARGS_DESCRIPTION, | |
features=datasets.Features( | |
{ | |
"predictions": datasets.Sequence(datasets.Value("string", id="token"), id="sequence"), | |
"references": datasets.Sequence( | |
datasets.Sequence(datasets.Value("string", id="token"), id="sequence"), id="references" | |
), | |
} | |
), | |
codebase_urls=["https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"], | |
reference_urls=[ | |
"https://en.wikipedia.org/wiki/BLEU", | |
"https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", | |
], | |
) | |
def _compute(self, predictions, references, max_order=4, smooth=False): | |
score = compute_bleu( | |
reference_corpus=references, translation_corpus=predictions, max_order=max_order, smooth=smooth | |
) | |
(bleu, precisions, bp, ratio, translation_length, reference_length) = score | |
return { | |
"bleu": bleu, | |
"precisions": precisions, | |
"brevity_penalty": bp, | |
"length_ratio": ratio, | |
"translation_length": translation_length, | |
"reference_length": reference_length, | |
} | |