# -*- coding: UTF-8 -*-
r"""
Created on 02.02.24
Module for raw ROUGE score calculation from:

@inproceedings{straka-etal-2018-sumeczech,
    title = "{S}ume{C}zech: Large {C}zech News-Based Summarization Dataset",
    author = "Straka, Milan and Mediankin, Nikita and Kocmi, Tom and {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and Hude{\v{c}}ek, Vojt{\v{e}}ch and Haji{\v{c}}, Jan",
    editor = "Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu",
    booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
    month = may,
    year = "2018",
    address = "Miyazaki, Japan",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L18-1551",
}

:author: Martin Dočekal
"""
import collections
import re
from typing import Sequence, Optional

import datasets
import evaluate
import numpy as np


class AggregateScore(collections.namedtuple("AggregateScore", ["low", "mid", "high"])):
    """Tuple containing confidence intervals for scores.

    Taken from:
    https://github.com/google-research/google-research/blob/master/rouge/scoring.py
    """


class Score(
        collections.namedtuple("Score", ["precision", "recall", "fmeasure"])):
    """Tuple containing precision, recall, and f-measure values."""


class BootstrapAggregator(object):
    """Aggregates scores to provide confidence intervals.

    Taken from:
    https://github.com/google-research/google-research/blob/master/rouge/scoring.py

    Sample usage:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
        aggregator = Aggregator()
        aggregator.add_scores(scorer.score("one two three", "one two"))
        aggregator.add_scores(scorer.score("one two five six", "seven eight"))
        result = aggregator.aggregate()
        print(result)
        {'rougeL': AggregateScore(
            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
            high=Score(precision=1.0, recall=0.66, fmeasure=0.80)),
         'rouge1': AggregateScore(
            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
            high=Score(precision=1.0, recall=0.66, fmeasure=0.80))}
    """

    def __init__(self, confidence_interval=0.95, n_samples=1000):
        """Initializes a BootstrapAggregator object.

        Args:
            confidence_interval: Confidence interval to compute on the mean as a
                decimal.
            n_samples: Number of samples to use for bootstrap resampling.

        Raises:
            ValueError: If invalid argument is given.
        """
        if confidence_interval < 0 or confidence_interval > 1:
            raise ValueError("confidence_interval must be in range [0, 1]")
        if n_samples <= 0:
            raise ValueError("n_samples must be positive")
        self._n_samples = n_samples
        self._confidence_interval = confidence_interval
        self._scores = collections.defaultdict(list)

    def add_scores(self, scores):
        """Adds a sample for future aggregation.

        Args:
            scores: Dict mapping score_type strings to a namedtuple object/class
                representing a score.
        """
        for score_type, score in scores.items():
            self._scores[score_type].append(score)

    def aggregate(self):
        """Aggregates scores previously added using add_scores.

        Returns:
            A dict mapping score_type to AggregateScore objects.
        """
        result = {}
        for score_type, scores in self._scores.items():
            # Stack scores into a 2-d matrix of (sample, measure).
            score_matrix = np.vstack(tuple(scores))
            # Percentiles are returned as (interval, measure).
            percentiles = self._bootstrap_resample(score_matrix)
            # Extract the three intervals (low, mid, high).
            intervals = tuple(
                (scores[0].__class__(*percentiles[j, :]) for j in range(3)))
            result[score_type] = AggregateScore(
                low=intervals[0], mid=intervals[1], high=intervals[2])
        return result

    def _bootstrap_resample(self, matrix):
        """Performs bootstrap resampling on a matrix of scores.

        Args:
            matrix: A 2-d matrix of (sample, measure).

        Returns:
            A 2-d matrix of (bounds, measure). There are three bounds: low (row 0),
            mid (row 1) and high (row 2). Mid is always the mean, while low and high
            bounds are specified by self._confidence_interval (which defaults to 0.95
            meaning it will return the 2.5th and 97.5th percentiles for a 95%
            confidence interval on the mean).
        """
        # Matrix of (bootstrap sample, measure).
        sample_mean = np.zeros((self._n_samples, matrix.shape[1]))
        for i in range(self._n_samples):
            sample_idx = np.random.choice(
                np.arange(matrix.shape[0]), size=matrix.shape[0])
            sample = matrix[sample_idx, :]
            sample_mean[i, :] = np.mean(sample, axis=0)

        # Take percentiles on the estimate of the mean using bootstrap samples.
        # Final result is a (bounds, measure) matrix.
        percentile_delta = (1 - self._confidence_interval) / 2
        q = 100 * np.array([percentile_delta, 0.5, 1 - percentile_delta])
        return np.percentile(sample_mean, q, axis=0)
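
# A minimal, illustrative sketch of using BootstrapAggregator on its own; it is
# not part of the original scoring script. The function name and the score
# values below are hypothetical.
def _example_bootstrap_aggregation():
    """Shows how per-document Score tuples are turned into confidence intervals."""
    aggregator = BootstrapAggregator(confidence_interval=0.95, n_samples=1000)
    # One add_scores call per document; the rows are later bootstrap-resampled.
    aggregator.add_scores({"rougeraw1": Score(precision=0.5, recall=0.4, fmeasure=0.44)})
    aggregator.add_scores({"rougeraw1": Score(precision=1.0, recall=0.8, fmeasure=0.89)})
    result = aggregator.aggregate()
    # result["rougeraw1"].low / .mid / .high are Score tuples bounding the mean.
    return result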
class RougeRawOriginal:
    """
    This is the original implementation of the ROUGERaw metric.

    Compute RougeRAW-1, RougeRAW-2, RougeRAW-L metrics.
    """

    class FScore:
        """F1 score representation."""

        def __init__(self, correct, gold, system):
            self.p = correct / system if system else 0.
            self.r = correct / gold if gold else 0.
            self.f = 2 * correct / (system + gold) if system + gold else 0.

    def _rouge_n(self, n, gold_words, system_words):
        """Compute Rouge-n for given words."""

        def n_grams(n, words):
            # Count n-gram occurrences and the total number of n-grams.
            ngrams = {}
            total = 0
            for i in range(len(words) - n + 1):
                ngram = "\t".join(words[i:i + n])
                ngrams[ngram] = 1 + ngrams.get(ngram, 0)
                total += 1
            return ngrams, total

        gold_ngrams, gold_total = n_grams(n, gold_words)
        system_ngrams, system_total = n_grams(n, system_words)

        # Clipped n-gram overlap between system and gold.
        intersection = 0
        for ngram in system_ngrams:
            intersection += min(system_ngrams[ngram], gold_ngrams.get(ngram, 0))

        return self.FScore(intersection, gold_total, system_total)

    def _rouge_l(self, gold_words, system_words):
        """Compute Rouge-L for given words."""
        # Dynamic programming table for the longest common subsequence (LCS).
        lcs = [[0] * len(system_words) for _ in gold_words]
        for r in range(len(gold_words)):
            for s in range(len(system_words)):
                if gold_words[r] == system_words[s]:
                    lcs[r][s] = 1 + (lcs[r - 1][s - 1] if r and s else 0)
                lcs[r][s] = max(lcs[r][s], lcs[r - 1][s] if r else 0)
                lcs[r][s] = max(lcs[r][s], lcs[r][s - 1] if s else 0)

        return self.FScore(lcs[-1][-1], len(gold_words), len(system_words))

    def _tokenize(self, text):
        """Tokenize given text."""
        # Insert a space at every word boundary, collapse whitespace runs and
        # split on spaces. The flags must be passed by keyword, otherwise
        # re.sub would interpret them as its `count` argument.
        text = re.sub(r"\b", " ", text, flags=re.UNICODE)
        return re.sub(r"\s+", " ", text, flags=re.UNICODE).strip().split(" ")

    def document(self, gold, system):
        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given documents.

        Each document should be a string.
        """
        assert isinstance(gold, str) and isinstance(system, str), "Expected string arguments"

        lc_gold_words = [word.lower() for word in self._tokenize(gold)]
        lc_system_words = [word.lower() for word in self._tokenize(system)]

        return {
            "1": self._rouge_n(1, lc_gold_words, lc_system_words),
            "2": self._rouge_n(2, lc_gold_words, lc_system_words),
            "L": self._rouge_l(lc_gold_words, lc_system_words),
        }

    def corpus(self, gold, system, aggregate=True):
        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given corpora.

        Each corpus should be a collection of documents, each document a string.

        If aggregate is True, the lower, mid and upper bounds of the confidence
        interval are returned.
        """
        assert isinstance(gold, list) and isinstance(system, list), "Expected list arguments"
        assert len(gold) == len(system), "Given corpora should be of the same length"

        if aggregate:
            aggregator = BootstrapAggregator()
        else:
            rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}

        if len(gold):
            for gold_document, system_document in zip(gold, system):
                for key, value in self.document(gold_document, system_document).items():
                    if aggregate:
                        aggregator.add_scores({
                            key: Score(precision=value.p, recall=value.r, fmeasure=value.f)
                        })
                    else:
                        rouge[key].p += value.p
                        rouge[key].r += value.r
                        rouge[key].f += value.f

            if not aggregate:
                # Average the accumulated per-document scores.
                for key in rouge:
                    rouge[key].p /= len(gold)
                    rouge[key].r /= len(gold)
                    rouge[key].f /= len(gold)

        if aggregate:
            # Convert the aggregated named tuples into a flat dictionary.
            rouge = {}
            for k, ag_score in aggregator.aggregate().items():
                rouge[k + "_low_precision"] = float(ag_score.low.precision)
                rouge[k + "_low_recall"] = float(ag_score.low.recall)
                rouge[k + "_low_fmeasure"] = float(ag_score.low.fmeasure)
                rouge[k + "_mid_precision"] = float(ag_score.mid.precision)
                rouge[k + "_mid_recall"] = float(ag_score.mid.recall)
                rouge[k + "_mid_fmeasure"] = float(ag_score.mid.fmeasure)
                rouge[k + "_high_precision"] = float(ag_score.high.precision)
                rouge[k + "_high_recall"] = float(ag_score.high.recall)
                rouge[k + "_high_fmeasure"] = float(ag_score.high.fmeasure)

        return rouge
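
# An illustrative sketch of scoring a single document pair with the raw scorer;
# it is not part of the original script and the example texts are hypothetical.
def _example_document_score():
    scorer = RougeRawOriginal()
    scores = scorer.document(gold="the cat sat on the mat", system="the cat is on the mat")
    # Each value is an FScore carrying precision (.p), recall (.r) and F1 (.f).
    return {key: (value.p, value.r, value.f) for key, value in scores.items()}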
_CITATION = r"""@inproceedings{straka-etal-2018-sumeczech,
    title = "{S}ume{C}zech: Large {C}zech News-Based Summarization Dataset",
    author = "Straka, Milan and Mediankin, Nikita and Kocmi, Tom and {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and Hude{\v{c}}ek, Vojt{\v{e}}ch and Haji{\v{c}}, Jan",
    editor = "Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu",
    booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
    month = may,
    year = "2018",
    address = "Miyazaki, Japan",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L18-1551",
}
"""

_DESCRIPTION = """\
ROUGE RAW is a language-agnostic variant of ROUGE that uses no stemmer, stop words or synonym matching.
This is a wrapper around the original http://hdl.handle.net/11234/1-2615 script.
"""

_KWARGS_DESCRIPTION = """
ROUGE RAW metric for a list of predictions and references.
Args:
    predictions: list of predictions to evaluate. Each prediction should be a string with tokens
        separated by spaces.
    references: list of references, one for each prediction. Each reference should be a string with
        tokens separated by spaces.
    select: (Optional) string. The name of a single value to return, using the key format described
        below (e.g. '1_mid_fmeasure', or '1_fmeasure' when aggregate is False). If None, all metrics
        are returned as a dictionary.
    aggregate: (Optional) boolean, defaults to True. If True, bootstrapped confidence intervals are
        computed; if False, plain corpus averages are returned.
Returns:
    This metric outputs a dictionary containing the scores. There are precision, recall and F1
    values for RougeRAW-1, RougeRAW-2 and RougeRAW-L.
    By default the bootstrapped confidence intervals are calculated, meaning that for each metric
    there are low, mid and high values specifying the confidence interval.

    Key format:
    ```
    {1|2|L}_{low|mid|high}_{precision|recall|fmeasure}

    e.g.: 1_low_precision
    ```

    If aggregate is False the format is:
    ```
    {1|2|L}_{precision|recall|fmeasure}

    e.g.: 1_precision
    ```
Examples:
    >>> rougeraw = evaluate.load('CZLC/rouge_raw')
    >>> predictions = ["the cat is on the mat", "hello there"]
    >>> references = ["the cat is on the mat", "hello there"]
    >>> results = rougeraw.compute(predictions=predictions, references=references, aggregate=False)
    >>> print(results)
    {'1_precision': 1.0, '1_recall': 1.0, '1_fmeasure': 1.0, '2_precision': 1.0, '2_recall': 1.0, '2_fmeasure': 1.0, 'L_precision': 1.0, 'L_recall': 1.0, 'L_fmeasure': 1.0}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class RougeRaw(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            reference_urls=[
                "http://hdl.handle.net/11234/1-2615",
            ],
        )

    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None,
                 aggregate: bool = True):
        # The original implementation expects (gold, system) order, i.e. references first.
        res = RougeRawOriginal().corpus(references, predictions, aggregate=aggregate)

        if not aggregate:
            # Flatten the FScore objects into a dictionary of floats.
            res = {
                "1_precision": res["1"].p,
                "1_recall": res["1"].r,
                "1_fmeasure": res["1"].f,
                "2_precision": res["2"].p,
                "2_recall": res["2"].r,
                "2_fmeasure": res["2"].f,
                "L_precision": res["L"].p,
                "L_recall": res["L"].r,
                "L_fmeasure": res["L"].f,
            }

        if select is not None:
            return res[select]
        return res
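
# Illustrative usage sketch, not part of the original script: run the underlying
# scorer directly on a hypothetical toy corpus when this file is executed.
if __name__ == "__main__":
    gold = ["the cat is on the mat", "hello there general"]
    system = ["the cat is on the mat", "hello there"]
    scorer = RougeRawOriginal()

    # Plain corpus averages (aggregate=False) yield one FScore per metric.
    for key, score in scorer.corpus(gold, system, aggregate=False).items():
        print(f"RougeRAW-{key}: p={score.p:.3f} r={score.r:.3f} f={score.f:.3f}")

    # With aggregate=True the result is a flat dict of bootstrapped bounds.
    aggregated = scorer.corpus(gold, system, aggregate=True)
    print("RougeRAW-L F1 interval:", aggregated["L_low_fmeasure"],
          aggregated["L_mid_fmeasure"], aggregated["L_high_fmeasure"])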