Spaces:

evaluate-metric
/

frugalscore

Running

App Files Files Community

frugalscore / frugalscore.py

lvwerra HF staff

Update Space (evaluate main: c447fc8e)

1698f0a over 1 year ago

raw history blame contribute delete

No virus

4.61 kB

	# Copyright 2022 The HuggingFace Datasets Authors and the current metric script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""FrugalScore metric."""

	import datasets
	import torch
	from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

	import evaluate


	_CITATION = """\
	@article{eddine2021frugalscore,
	title={FrugalScore: Learning Cheaper, Lighter and Faster Evaluation Metrics for Automatic Text Generation},
	author={Eddine, Moussa Kamal and Shang, Guokan and Tixier, Antoine J-P and Vazirgiannis, Michalis},
	journal={arXiv preprint arXiv:2110.08559},
	year={2021}
	}
	"""

	_DESCRIPTION = """\
	FrugalScore is a reference-based metric for NLG models evaluation. It is based on a distillation approach that allows to learn a fixed, low cost version of any expensive NLG metric, while retaining most of its original performance.
	"""


	_KWARGS_DESCRIPTION = """
	Calculates how good are predictions given some references, using certain scores.
	Args:
	predictions (list of str): list of predictions to score. Each predictions
	should be a string.
	references (list of str): list of reference for each prediction. Each
	reference should be a string.
	batch_size (int): the batch size for predictions.
	max_length (int): maximum sequence length.
	device (str): either gpu or cpu
	Returns:
	scores (list of int): list of scores.
	Examples:
	>>> frugalscore = evaluate.load("frugalscore")
	>>> results = frugalscore.compute(predictions=['hello there', 'huggingface'], references=['hello world', 'hugging face'])
	>>> print([round(s, 3) for s in results["scores"]])
	[0.631, 0.645]
	"""


	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class FRUGALSCORE(evaluate.Metric):
	def _info(self):
	return evaluate.MetricInfo(
	description=_DESCRIPTION,
	citation=_CITATION,
	inputs_description=_KWARGS_DESCRIPTION,
	features=datasets.Features(
	{
	"predictions": datasets.Value("string"),
	"references": datasets.Value("string"),
	}
	),
	homepage="https://github.com/moussaKam/FrugalScore",
	)

	def _download_and_prepare(self, dl_manager):
	if self.config_name == "default":
	checkpoint = "moussaKam/frugalscore_tiny_bert-base_bert-score"
	else:
	checkpoint = self.config_name
	self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
	self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

	def _compute(
	self,
	predictions,
	references,
	batch_size=32,
	max_length=128,
	device=None,
	):
	"""Returns the scores"""
	assert len(predictions) == len(
	references
	), "predictions and references should have the same number of sentences."
	if device is not None:
	assert device in ["gpu", "cpu"], "device should be either gpu or cpu."
	else:
	device = "gpu" if torch.cuda.is_available() else "cpu"
	training_args = TrainingArguments(
	"trainer",
	fp16=(device == "gpu"),
	per_device_eval_batch_size=batch_size,
	report_to="all",
	no_cuda=(device == "cpu"),
	log_level="warning",
	)
	dataset = {"sentence1": predictions, "sentence2": references}
	raw_datasets = datasets.Dataset.from_dict(dataset)

	def tokenize_function(data):
	return self.tokenizer(
	data["sentence1"], data["sentence2"], max_length=max_length, truncation=True, padding=True
	)

	tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
	tokenized_datasets.remove_columns(["sentence1", "sentence2"])
	trainer = Trainer(self.model, training_args, tokenizer=self.tokenizer)
	predictions = trainer.predict(tokenized_datasets)
	return {"scores": list(predictions.predictions.squeeze(-1))}