Spaces:

CZLC
/

rouge_raw

Runtime error

rouge_raw / rouge_raw.py

Martin Dočekal

example fix

d2429ca over 1 year ago

15 kB

	# -- coding: UTF-8 --
	r"""
	Created on 02.02.24
	Module for raw ROUGE score calculation from:
	@inproceedings{straka-etal-2018-sumeczech,
	title = "{S}ume{C}zech: Large {C}zech News-Based Summarization Dataset",
	author = "Straka, Milan and
	Mediankin, Nikita and
	Kocmi, Tom and
	{\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
	Hude{\v{c}}ek, Vojt{\v{e}}ch and
	Haji{\v{c}}, Jan",
	editor = "Calzolari, Nicoletta and
	Choukri, Khalid and
	Cieri, Christopher and
	Declerck, Thierry and
	Goggi, Sara and
	Hasida, Koiti and
	Isahara, Hitoshi and
	Maegaard, Bente and
	Mariani, Joseph and
	Mazo, H{\'e}l{\`e}ne and
	Moreno, Asuncion and
	Odijk, Jan and
	Piperidis, Stelios and
	Tokunaga, Takenobu",
	booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
	month = may,
	year = "2018",
	address = "Miyazaki, Japan",
	publisher = "European Language Resources Association (ELRA)",
	url = "https://aclanthology.org/L18-1551",
	}

	NOTE: this docstring is a raw string so that the BibTeX accent macros above
	(backslash sequences) are kept verbatim instead of being read as Python escapes.

	:author: Martin Dočekal
	"""
	import collections
	import re
	from typing import Sequence, Optional

	import datasets
	import evaluate
	import numpy as np


	class AggregateScore(
	        collections.namedtuple("AggregateScore", "low mid high")):
	    """
	    Named triple ``(low, mid, high)`` holding the bounds of a confidence interval.

	    Adapted from:
	    https://github.com/google-research/google-research/blob/master/rouge/scoring.py
	    """


	class Score(collections.namedtuple("Score", "precision recall fmeasure")):
	    """Named triple ``(precision, recall, fmeasure)`` for a single score."""


	class BootstrapAggregator(object):
	    """Collects per-sample scores and summarises them with bootstrap confidence intervals.

	    Adapted from:
	    https://github.com/google-research/google-research/blob/master/rouge/scoring.py

	    Sample usage:
	        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
	        aggregator = BootstrapAggregator()
	        aggregator.add_scores(scorer.score("one two three", "one two"))
	        aggregator.add_scores(scorer.score("one two five six", "seven eight"))
	        result = aggregator.aggregate()
	        print(result)
	        {'rougeL': AggregateScore(
	             low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
	             mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
	             high=Score(precision=1.0, recall=0.66, fmeasure=0.80)),
	         'rouge1': AggregateScore(
	             low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
	             mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
	             high=Score(precision=1.0, recall=0.66, fmeasure=0.80))}
	    """

	    def __init__(self, confidence_interval=0.95, n_samples=1000):
	        """Creates an empty aggregator.

	        Args:
	          confidence_interval: Width of the confidence interval as a decimal
	            in [0, 1].
	          n_samples: Number of bootstrap resampling rounds; must be positive.

	        Raises:
	          ValueError: If either argument is outside its allowed range.
	        """
	        interval_out_of_range = confidence_interval < 0 or confidence_interval > 1
	        if interval_out_of_range:
	            raise ValueError("confidence_interval must be in range [0, 1]")
	        if n_samples <= 0:
	            raise ValueError("n_samples must be positive")

	        # score_type -> list of score tuples added so far.
	        self._scores = collections.defaultdict(list)
	        self._confidence_interval = confidence_interval
	        self._n_samples = n_samples

	    def add_scores(self, scores):
	        """Records one sample for later aggregation.

	        Args:
	          scores: Dict mapping score_type strings to a namedtuple object/class
	            representing a score.
	        """
	        for score_type, sample in scores.items():
	            self._scores[score_type].append(sample)

	    def aggregate(self):
	        """Summarises every score type recorded through add_scores.

	        Returns:
	          A dict mapping score_type to AggregateScore objects whose low/mid/high
	          members are instances of the class of the first recorded sample.
	        """
	        summary = {}
	        for score_type, samples in self._scores.items():
	            # Rows are samples, columns are measures.
	            stacked = np.vstack(tuple(samples))
	            # Rows are bounds (low, mid, high), columns are measures.
	            bounds = self._bootstrap_resample(stacked)
	            score_cls = samples[0].__class__
	            low, mid, high = (score_cls(*bounds[row, :]) for row in range(3))
	            summary[score_type] = AggregateScore(low=low, mid=mid, high=high)
	        return summary

	    def _bootstrap_resample(self, matrix):
	        """Bootstrap-resamples the rows of a score matrix.

	        Args:
	          matrix: A 2-d matrix of (sample, measure).

	        Returns:
	          A 2-d matrix of (bounds, measure) with three rows computed over the
	          means of the bootstrap samples: the lower percentile (row 0), the 50th
	          percentile (row 1) and the upper percentile (row 2). The outer
	          percentiles follow self._confidence_interval, e.g. 2.5 and 97.5 for
	          the default 0.95.
	        """
	        n_rows = matrix.shape[0]
	        # One row of per-measure means for each bootstrap round.
	        bootstrap_means = np.zeros((self._n_samples, matrix.shape[1]))
	        for draw in range(self._n_samples):
	            # Draw n_rows row indices with replacement.
	            picked = np.random.choice(np.arange(n_rows), size=n_rows)
	            bootstrap_means[draw, :] = np.mean(matrix[picked, :], axis=0)

	        tail = (1 - self._confidence_interval) / 2
	        q = 100 * np.array([tail, 0.5, 1 - tail])
	        return np.percentile(bootstrap_means, q, axis=0)


	class RougeRawOriginal:
	    """
	    Implementation of the ROUGERaw metric following the original script.
	    Compute RougeRAW-1, RougeRAW-2, RougeRAW-L metrics.

	    Documents are lower-cased and split into tokens at word boundaries; no stemming,
	    stop-word removal or synonym matching is applied.
	    """

	    # Zero-width word boundaries: a space is inserted at each of them so that punctuation
	    # ends up as separate tokens. (For str patterns Unicode matching is the default.)
	    _WORD_BOUNDARY = re.compile(r"\b")
	    # Any run of whitespace (spaces, tabs, newlines) is collapsed into one space.
	    _WHITESPACE_RUN = re.compile(r"\s+")

	    class FScore:
	        """Precision (``p``), recall (``r``) and F1 (``f``) derived from match counts."""

	        def __init__(self, correct, gold, system):
	            """
	            :param correct: number of units shared by the reference and the system output
	            :param gold: number of units in the reference
	            :param system: number of units in the system output
	            """
	            # Every ratio falls back to 0 when its denominator is zero.
	            self.p = correct / system if system else 0.
	            self.r = correct / gold if gold else 0.
	            self.f = 2 * correct / (system + gold) if system + gold else 0.

	    @staticmethod
	    def _ngram_counts(n, words):
	        """Return a Counter of all n-grams (as tuples of tokens) occurring in ``words``."""
	        return collections.Counter(tuple(words[i:i + n]) for i in range(len(words) - n + 1))

	    def _rouge_n(self, n, gold_words, system_words):
	        """Compute Rouge-n for given words.

	        :param n: n-gram order
	        :param gold_words: list of reference tokens
	        :param system_words: list of system tokens
	        :return: FScore built from the clipped n-gram overlap
	        """
	        gold_ngrams = self._ngram_counts(n, gold_words)
	        system_ngrams = self._ngram_counts(n, system_words)

	        # Counter intersection keeps, for every shared n-gram, the smaller of both counts.
	        overlap = sum((gold_ngrams & system_ngrams).values())

	        return self.FScore(overlap, sum(gold_ngrams.values()), sum(system_ngrams.values()))

	    def _rouge_l(self, gold_words, system_words):
	        """Compute Rouge-L for given words.

	        The score is based on the length of the longest common subsequence (LCS) of both
	        token lists. Empty token lists are supported and give an LCS length of 0.

	        :param gold_words: list of reference tokens
	        :param system_words: list of system tokens
	        :return: FScore built from the LCS length
	        """
	        # Row-by-row dynamic programming; column 0 is padding for the empty system prefix,
	        # so only the previous row has to be kept.
	        previous = [0] * (len(system_words) + 1)
	        for gold_word in gold_words:
	            current = [0]
	            for s, system_word in enumerate(system_words, start=1):
	                if gold_word == system_word:
	                    current.append(previous[s - 1] + 1)
	                else:
	                    current.append(max(previous[s], current[s - 1]))
	            previous = current

	        return self.FScore(previous[-1], len(gold_words), len(system_words))

	    def _tokenize(self, text):
	        """Tokenize given text.

	        A space is inserted at every word boundary, whitespace runs are collapsed and the
	        result is split on single spaces. An empty or whitespace-only text yields ``[""]``.

	        NOTE: earlier revisions passed ``re.UNICODE`` as the positional ``count`` argument of
	        ``re.sub``, which limited each substitution to the first 32 matches and left the rest
	        of longer texts untokenized. All matches are processed now.

	        :param text: text to tokenize
	        :return: list of tokens
	        """
	        spaced = self._WORD_BOUNDARY.sub(" ", text)
	        return self._WHITESPACE_RUN.sub(" ", spaced).strip().split(" ")

	    def document(self, gold, system):
	        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given documents.
	        Each document should be a string.

	        :param gold: reference document
	        :param system: system (predicted) document
	        :return: dict with keys "1", "2" and "L" mapping to FScore objects
	        """

	        assert isinstance(gold, str) and isinstance(system, str), "Expected string arguments"

	        lc_gold_words = [word.lower() for word in self._tokenize(gold)]
	        lc_system_words = [word.lower() for word in self._tokenize(system)]

	        return {
	            "1": self._rouge_n(1, lc_gold_words, lc_system_words),
	            "2": self._rouge_n(2, lc_gold_words, lc_system_words),
	            "L": self._rouge_l(lc_gold_words, lc_system_words),
	        }

	    def corpus(self, gold, system, aggregate=True):
	        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given corpora.
	        Each corpus should be a collection of documents, each document a string.

	        :param gold: list of reference documents
	        :param system: list of system (predicted) documents, aligned with ``gold``
	        :param aggregate: If True, the lower, mid, and upper bounds of the bootstrap
	            confidence interval are returned as a flat dict of floats with keys
	            ``{1|2|L}_{low|mid|high}_{precision|recall|fmeasure}`` (empty for an empty
	            corpus). If False, a dict with keys "1", "2" and "L" mapping to FScore
	            objects holding the per-document averages is returned.
	        :return: the scores in the format selected by ``aggregate``
	        """

	        assert isinstance(gold, list) and isinstance(system, list), "Expected list arguments"
	        assert len(gold) == len(system), "Given corpora should be of the same length"

	        per_document = [
	            self.document(gold_document, system_document)
	            for gold_document, system_document in zip(gold, system)
	        ]

	        if not aggregate:
	            rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}
	            for scores in per_document:
	                for key, value in scores.items():
	                    rouge[key].p += value.p
	                    rouge[key].r += value.r
	                    rouge[key].f += value.f
	            # An empty corpus keeps the all-zero scores (and avoids dividing by zero).
	            if per_document:
	                for averaged in rouge.values():
	                    averaged.p /= len(per_document)
	                    averaged.r /= len(per_document)
	                    averaged.f /= len(per_document)
	            return rouge

	        aggregator = BootstrapAggregator()
	        for scores in per_document:
	            aggregator.add_scores({
	                key: Score(precision=value.p, recall=value.r, fmeasure=value.f)
	                for key, value in scores.items()
	            })

	        # Flatten the nested named tuples into a plain dict of Python floats.
	        rouge = {}
	        for key, ag_score in aggregator.aggregate().items():
	            for bound in ("low", "mid", "high"):
	                bound_score = getattr(ag_score, bound)
	                for measure in ("precision", "recall", "fmeasure"):
	                    rouge[f"{key}_{bound}_{measure}"] = float(getattr(bound_score, measure))

	        return rouge


	# BibTeX entry of the paper that introduced the metric. Kept as a raw string so that the
	# accent macros (backslash sequences) are preserved verbatim instead of being interpreted
	# as Python escape sequences.
	_CITATION = r"""@inproceedings{straka-etal-2018-sumeczech,
	title = "{S}ume{C}zech: Large {C}zech News-Based Summarization Dataset",
	author = "Straka, Milan and
	Mediankin, Nikita and
	Kocmi, Tom and
	{\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
	Hude{\v{c}}ek, Vojt{\v{e}}ch and
	Haji{\v{c}}, Jan",
	editor = "Calzolari, Nicoletta and
	Choukri, Khalid and
	Cieri, Christopher and
	Declerck, Thierry and
	Goggi, Sara and
	Hasida, Koiti and
	Isahara, Hitoshi and
	Maegaard, Bente and
	Mariani, Joseph and
	Mazo, H{\'e}l{\`e}ne and
	Moreno, Asuncion and
	Odijk, Jan and
	Piperidis, Stelios and
	Tokunaga, Takenobu",
	booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
	month = may,
	year = "2018",
	address = "Miyazaki, Japan",
	publisher = "European Language Resources Association (ELRA)",
	url = "https://aclanthology.org/L18-1551",
	}
	"""

	# Short, user-facing description of the metric.
	_DESCRIPTION = """\
	ROUGE RAW is a language-agnostic variant of ROUGE without stemmer, stop words and synonyms.
	This is a wrapper around the original http://hdl.handle.net/11234/1-2615 script.
	"""

	# User-facing description of the arguments and of the returned keys. The keys listed here
	# must match the ones produced by the metric's compute step.
	_KWARGS_DESCRIPTION = """
	ROUGE RAW metric for list of predictions and references.
	Args:
	    predictions: list of predictions to evaluate. Each prediction should be a string with tokens separated by spaces.
	    references: list of reference for each prediction. Each reference should be a string with tokens separated by spaces.
	    select: (Optional) string. The key of the single score to return. It must be one of the keys described in
	        the "Key format" section below, e.g. '1_mid_fmeasure' when aggregate is True or '1_fmeasure' when aggregate is False.
	        If None, all metrics are returned as a dictionary.
	    aggregate: (Optional) bool, True by default. If True, the bootstrapped confidence intervals are calculated.
	        If False, only the scores averaged over all documents are returned.
	Returns:
	    This metric outputs a dictionary, containing the scores.
	    There are precision, recall, F1 values for rougeraw-1, rougeraw-2 and rougeraw-l. By default the bootstrapped confidence intervals are calculated, meaning that for each metric there are low, mid, high values specifying the confidence interval.

	    Key format:
	    ```
	    {1|2|L}_{low|mid|high}_{precision|recall|fmeasure}
	    e.g.: 1_low_precision
	    ```

	    If aggregate is False the format is:
	    ```
	    {1|2|L}_{precision|recall|fmeasure}
	    e.g.: 1_precision
	    ```
	Examples:
	    >>> rougeraw = evaluate.load('CZLC/rouge_raw')
	    >>> predictions = ["the cat is on the mat", "hello there"]
	    >>> references = ["the cat is on the mat", "hello there"]
	    >>> results = rougeraw.compute(predictions=predictions, references=references, aggregate=False)
	    >>> print(results)
	    {'1_precision': 1.0, '1_recall': 1.0, '1_fmeasure': 1.0, '2_precision': 1.0, '2_recall': 1.0, '2_fmeasure': 1.0, 'L_precision': 1.0, 'L_recall': 1.0, 'L_fmeasure': 1.0}
	"""


	# `evaluate` metric wrapper: declares the metric metadata and delegates the actual scoring
	# to RougeRawOriginal. (Documented with comments because the class docstring is assembled
	# by the decorator below.)
	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class RougeRaw(evaluate.Metric):
	    def _info(self):
	        """Build the metric metadata: descriptions, citation, input features and reference URL."""
	        # Both inputs are declared as plain string values.
	        string_inputs = datasets.Features(
	            {
	                "predictions": datasets.Value("string", id="sequence"),
	                "references": datasets.Value("string", id="sequence"),
	            }
	        )
	        return evaluate.MetricInfo(
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            features=[string_inputs],
	            reference_urls=["http://hdl.handle.net/11234/1-2615"],
	        )

	    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None,
	                 aggregate: bool = True):
	        """Score ``predictions`` against ``references``.

	        :param predictions: system outputs, one string per document
	        :param references: reference texts, aligned with ``predictions``
	        :param select: key of the single score to return; None returns the whole dict
	        :param aggregate: passed on to ``RougeRawOriginal.corpus``
	        :return: dict of scores, or only the value stored under ``select``
	        """
	        # The references are the gold side, the predictions the system side.
	        scores = RougeRawOriginal().corpus(references, predictions, aggregate=aggregate)

	        if not aggregate:
	            # Without aggregation the corpus scores are objects with p/r/f attributes;
	            # flatten them into "<variant>_<measure>" entries.
	            scores = {
	                f"{variant}_{measure}": getattr(scores[variant], attribute)
	                for variant in ("1", "2", "L")
	                for measure, attribute in (("precision", "p"), ("recall", "r"), ("fmeasure", "f"))
	            }

	        return scores if select is None else scores[select]