Spaces:
Runtime error
Runtime error
File size: 7,021 Bytes
47e6046 4e1f99b 47e6046 4e1f99b 47e6046 93f5721 c6d7489 93f5721 47e6046 a0d970f 47e6046 8b38382 47e6046 c6d7489 a0d970f 47e6046 a0d970f c6d7489 a0d970f c6d7489 a0d970f 47e6046 a0d970f c6d7489 a0d970f c6d7489 a0d970f 47e6046 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" METEOR metric. """
import datasets
import numpy as np
from nltk.translate import meteor_score
from packaging import version
import evaluate
if evaluate.config.PY_VERSION < version.parse("3.8"):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
if NLTK_VERSION >= version.Version("3.6.4"):
from nltk import word_tokenize
_CITATION = """\
@inproceedings{banarjee2005,
title = {{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
author = {Banerjee, Satanjeev and Lavie, Alon},
booktitle = {Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
month = jun,
year = {2005},
address = {Ann Arbor, Michigan},
publisher = {Association for Computational Linguistics},
url = {https://www.aclweb.org/anthology/W05-0909},
pages = {65--72},
}
"""
_DESCRIPTION = """\
METEOR, an automatic metric for machine translation evaluation
that is based on a generalized concept of unigram matching between the
machine-produced translation and human-produced reference translations.
Unigrams can be matched based on their surface forms, stemmed forms,
and meanings; furthermore, METEOR can be easily extended to include more
advanced matching strategies. Once all generalized unigram matches
between the two strings have been found, METEOR computes a score for
this matching using a combination of unigram-precision, unigram-recall, and
a measure of fragmentation that is designed to directly capture how
well-ordered the matched words in the machine translation are in relation
to the reference.
METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic
data and 0.331 on the Chinese data. This is shown to be an improvement on
using simply unigram-precision, unigram-recall and their harmonic F1
combination.
"""
_KWARGS_DESCRIPTION = """
Computes METEOR score of translated segments against one or more references.
Args:
predictions: list of predictions to score. Each prediction
should be a string with tokens separated by spaces.
references: list of reference for each prediction. Each
reference should be a string with tokens separated by spaces.
alpha: Parameter for controlling relative weights of precision and recall. default: 0.9
beta: Parameter for controlling shape of penalty as a function of fragmentation. default: 3
gamma: Relative weight assigned to fragmentation penalty. default: 0.5
Returns:
'meteor': meteor score.
Examples:
>>> meteor = evaluate.load('meteor')
>>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
>>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
>>> results = meteor.compute(predictions=predictions, references=references)
>>> print(round(results["meteor"], 4))
0.6944
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Meteor(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py"],
reference_urls=[
"https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.meteor_score",
"https://en.wikipedia.org/wiki/METEOR",
],
)
def _download_and_prepare(self, dl_manager):
import nltk
nltk.download("wordnet")
if NLTK_VERSION >= version.Version("3.9.0"):
nltk.download("punkt_tab")
elif NLTK_VERSION >= version.Version("3.6.5"):
nltk.download("punkt")
if NLTK_VERSION >= version.Version("3.6.6"):
nltk.download("omw-1.4")
def _compute(self, predictions, references, alpha=0.9, beta=3, gamma=0.5):
multiple_refs = isinstance(references[0], list)
if NLTK_VERSION >= version.Version("3.6.5"):
# the version of METEOR in NLTK version 3.6.5 and earlier expect tokenized inputs
if multiple_refs:
scores = [
meteor_score.meteor_score(
[word_tokenize(ref) for ref in refs],
word_tokenize(pred),
alpha=alpha,
beta=beta,
gamma=gamma,
)
for refs, pred in zip(references, predictions)
]
else:
scores = [
meteor_score.single_meteor_score(
word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
)
for ref, pred in zip(references, predictions)
]
else:
if multiple_refs:
scores = [
meteor_score.meteor_score(
[[word_tokenize(ref) for ref in group] for group in references][0],
word_tokenize(pred),
alpha=alpha,
beta=beta,
gamma=gamma,
)
for ref, pred in zip(references, predictions)
]
else:
scores = [
meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
for ref, pred in zip(references, predictions)
]
return {"meteor": np.mean(scores)}
|