# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sem-NCG metric

Author: Naman Bansal
"""

import statistics
from dataclasses import dataclass
from typing import List, Tuple, Union

import datasets
import evaluate
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from .encoder_models import get_sbert_encoder, get_encoder
from .type_aliases import DEVICE_TYPE, NDArray, DOCUMENT_TYPE
from .utils import get_gpu, flatten_list, slice_embeddings, is_nested_list_of_type, \
    tokenize_and_prep_document

_CITATION = """\
@inproceedings{akter-etal-2022-revisiting,
    title = "Revisiting Automatic Evaluation of Extractive Summarization Task: Can We Do Better than {ROUGE}?",
    author = "Akter, Mousumi and Bansal, Naman and Karmaker, Shubhra Kanti",
    editor = "Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
    month = may,
    year = "2022",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-acl.122",
    doi = "10.18653/v1/2022.findings-acl.122",
    pages = "1547--1560",
    abstract = "It has been the norm for a long time to evaluate automated summarization tasks using the popular ROUGE metric.
    Although several studies in the past have highlighted the limitations of ROUGE, researchers have struggled to reach a consensus on a better alternative until today. One major limitation of the traditional ROUGE metric is the lack of semantic understanding (relies on direct overlap of n-grams). In this paper, we exclusively focus on the extractive summarization task and propose a semantic-aware nCG (normalized cumulative gain)-based evaluation metric (called Sem-nCG) for evaluating this task. One fundamental contribution of the paper is that it demonstrates how we can generate more reliable semantic-aware ground truths for evaluating extractive summarization tasks without any additional human intervention. To the best of our knowledge, this work is the first of its kind. We have conducted extensive experiments with this new metric using the widely used CNN/DailyMail dataset. Experimental results show that the new Sem-nCG metric is indeed semantic-aware, shows higher correlation with human judgement (more reliable) and yields a large number of disagreements with the original ROUGE metric (suggesting that ROUGE often leads to inaccurate conclusions also verified by humans).",
}
"""

_DESCRIPTION = """\
Sem-nCG (Semantic Normalized Cumulative Gain) Metric evaluates the quality of predicted sentences
(abstractive/extractive) in relation to reference sentences and documents using Semantic Normalized Cumulative Gain
(NCG). It computes gain values and NCG scores based on cosine similarity between sentence embeddings, leveraging a
Sentence-BERT encoder. This metric is designed to assess the relevance and ranking of predicted sentences, making it
useful for tasks such as summarization and information retrieval.
"""

_KWARGS_DESCRIPTION = """
Sem-nCG (Semantic Normalized Cumulative Gain) compares the system-generated summaries (predictions) with ground truth
reference summaries (references) and input documents (documents) using Semantic Normalized Cumulative Gain (NCG).
It computes gain values and NCG scores based on sentence embeddings.

Args:
    predictions (DOCUMENT_TYPE): The predicted sentences.
        `tokenize_sentences`=True -> predictions: List[str]
        `tokenize_sentences`=False -> predictions: List[List[str]]
    references (DOCUMENT_TYPE): The reference sentences.
        `tokenize_sentences`=True -> references: List[str]
        `tokenize_sentences`=False -> references: List[List[str]]
    documents (DOCUMENT_TYPE): Input documents.
        `tokenize_sentences`=True -> documents: List[str]
        `tokenize_sentences`=False -> documents: List[List[str]]
    k (int): The rank threshold used for evaluating gains (typically top-k sentences). Default is 3.
    gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
        bool -
            False - CPU (Default)
            True - GPU (device 0) if gpu is available else CPU
        int -
            n - GPU, device index n
        str -
            'cuda', 'gpu', 'cpu'
        List[Union[str, int]] - Multiple GPUs/cpus i.e. use multiple processes when computing embeddings
    batch_size (int): Batch size for encoding. Default is 32.
    verbose (bool): Flag to indicate verbose output. Default is False.
    tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
    pre_compute_embeddings (bool): Flag to indicate whether to pre-compute embeddings for all sentences.
        This speeds up computation but requires more memory. Default is False.
    debug (bool): Flag to return detailed debug information including ranked gains. Default is False.

Returns:
    Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
        If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores
        for each document.
        If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains` objects
        with detailed gain information for each document.

Examples of input formats:

Case 1: tokenize_sentences = True
    predictions: List[str] - List of predictions where each prediction is a document.
    references: List[str] - List of references where each reference is a document.
    documents: List[str] - List of input documents where each document is a document.
    Example:
        predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
        references = ["This is a reference sentence 1. This is a reference sentence 2."]
        documents = ["This is a document sentence 1. This is a document sentence 2."]

Case 2: tokenize_sentences = False
    predictions: List[List[str]] - List of predictions where each prediction is a list of sentences.
    references: List[List[str]] - List of references where each reference is a list of sentences.
    documents: List[List[str]] - List of input documents where each document is a list of sentences.
    Example:
        predictions = [["This is a prediction sentence 1.", "This is a prediction sentence 2."]]
        references = [["This is a reference sentence 1.", "This is a reference sentence 2."]]
        documents = [["This is a document sentence 1.", "This is a document sentence 2."]]

Examples:
    >>> import evaluate
    >>> predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
    >>> references = ["This is a reference sentence 1. This is a reference sentence 2."]
    >>> documents = ["This is a document sentence 1. This is a document sentence 2."]
    >>> metric = evaluate.load("nbansal/semncg", model_name="all-MiniLM-L6-v2")
    >>> mean_score, scores = metric.compute(predictions=predictions, references=references, documents=documents)
    >>> print(f"Mean SemnCG: {mean_score}")
"""


@dataclass
class RankedGains:
    """
    Dataclass to store ranked gains and associated metadata (returned per sample when `debug=True`).

    Attributes:
        gt_gains (List[Tuple[str, float]]): Ground truth (ideal) gains; each tuple holds a document sentence
            and the gain assigned to it from its similarity to the reference.
        pred_gains (List[Tuple[str, float]]): Predicted gains; each tuple holds a document sentence and the
            gain assigned to it from its similarity to the prediction.
        k (int): The rank threshold actually used for this sample (the requested `k`, capped at the number
            of document sentences).
        ncg (float): Normalized Cumulative Gain (NCG) score of the prediction for this sample.

    Notes:
        - `gt_gains` and `pred_gains` are sorted in descending order of gain.
    """
    gt_gains: List[Tuple[str, float]]
    pred_gains: List[Tuple[str, float]]
    k: int
    ncg: float


def compute_cosine_similarity(doc_embeds: NDArray, ref_embeds: NDArray) -> List[float]:
    """
    Compute the mean cosine similarity between each document embedding and all reference embeddings.

    Args:
        doc_embeds (NDArray): 2D array of shape (#Docs, Embedding_dim) containing document embeddings.
        ref_embeds (NDArray): 2D array of shape (#Refs, Embedding_dim) containing reference embeddings.

    Returns:
        List[float]: One score per row of `doc_embeds`: the cosine similarity to every row of `ref_embeds`,
            averaged over the reference rows.

    Notes:
        - Uses `cosine_similarity` from `sklearn.metrics.pairwise` to compute the pairwise similarities.
    """
    cosine_scores = cosine_similarity(doc_embeds, ref_embeds)  # [#Docs, #Refs]
    return np.mean(cosine_scores, axis=1).tolist()


def compute_gain(sim_scores: List[float]) -> List[Tuple[int, float]]:
    """
    Convert similarity scores into rank-based gain values.

    Args:
        sim_scores (List[float]): Similarity score of each document sentence
            (`compute_cosine_similarity(doc_embeds, ref_embeds)`).

    Returns:
        List[Tuple[int, float]]: `(sentence_index, gain)` tuples sorted by descending gain. Empty input yields
            an empty list.

    Notes:
        - With `n` scores, the sentence at rank `r` (r = 1 for the highest score) receives
          gain = (n - r + 1) / (n * (n + 1) / 2), so the best sentence gets the largest gain and all gains
          sum to 1.
    """
    count = len(sim_scores)
    # argsort is ascending; reversing it lists sentence indices from most to least similar.
    # `.tolist()` turns the numpy integers into plain Python ints, matching the annotated return type.
    ranked_indices = np.array(sim_scores).argsort()[::-1].tolist()
    denominator = count * (count + 1) / 2  # 1 + 2 + ... + n
    return [(s_idx, weight / denominator) for s_idx, weight in zip(ranked_indices, range(count, 0, -1))]


def score_ncg(model_relevance: List[float], gt_relevance: List[float]) -> float:
    """
    Calculate the Normalized Cumulative Gain (NCG) score from model and ground truth relevance values.

    Args:
        model_relevance (List[float]): Gain values collected for the model's top-ranked items.
        gt_relevance (List[float]): Gain values of the ground truth (ideal) top-ranked items.

    Returns:
        float: `sum(model_relevance) / sum(gt_relevance)`, or 0.0 when the ground truth sum is 0
            (avoids a division by zero).
    """
    cg = sum(model_relevance)  # cumulative gain of the model
    icg = sum(gt_relevance)  # ideal cumulative gain
    return cg / icg if icg != 0 else 0.0


def compute_ncg(pred_gains: List[Tuple[int, float]], gt_gains: List[Tuple[int, float]], k: int) -> float:
    """
    Compute the Normalized Cumulative Gain (NCG) score of the predicted ranking up to rank k.

    Args:
        pred_gains (List[Tuple[int, float]]): `(sentence_index, gain)` tuples derived from the prediction,
            sorted in descending order of gain.
        gt_gains (List[Tuple[int, float]]): `(sentence_index, gain)` tuples derived from the reference
            (ideal gains), sorted in descending order of gain.
        k (int): The rank threshold; only the first `k` entries of each list are considered.

    Returns:
        float: NCG score of the predicted ranking relative to the ground truth ranking.

    Raises:
        KeyError: If one of the first `k` predicted sentence indices does not occur in `gt_gains`.

    Notes:
        - The model's top-k sentences are scored with their *ground truth* gain, and the result is normalized
          by the gain of the ground truth's own top-k sentences (via `score_ncg`).
    """
    gt_dict = dict(gt_gains)
    gt_rel = [v for _, v in gt_gains[:k]]
    model_rel = [gt_dict[position] for position, _ in pred_gains[:k]]
    return score_ncg(model_rel, gt_rel)


def _validate_input_format(
        tokenize_sentences: bool,
        predictions: DOCUMENT_TYPE,
        references: DOCUMENT_TYPE,
        documents: DOCUMENT_TYPE
):
    """
    Validate the format of predictions, references, and documents.

    Args:
        tokenize_sentences (bool): Flag indicating whether sentences should be tokenized.
        predictions (DOCUMENT_TYPE): Predictions to validate.
        references (DOCUMENT_TYPE): References to validate.
        documents (DOCUMENT_TYPE): Documents to validate.

    Raises:
        ValueError: If the three inputs differ in length, are empty, or are not nested to the expected depth.

    Validation Criteria:
        1. Predictions, references, and documents must have the same, non-zero length.
        2. If `tokenize_sentences` is True, each must be a list of strings
           (`is_nested_list_of_type(obj, element_type=str, depth=1)`).
        3. If `tokenize_sentences` is False, each must be a list of lists of strings
           (`is_nested_list_of_type(obj, element_type=str, depth=2)`).

    Notes:
        - `DOCUMENT_TYPE`: Union[List[str], List[List[str]]]

    Example:
        >>> tokenize_sentences = True
        >>> predictions = ["This is prediction 1.", "This is prediction 2."]
        >>> references = ["Reference for prediction 1.", "Reference for prediction 2."]
        >>> documents = ["Document 1 content.", "Document 2 content."]
        >>> _validate_input_format(tokenize_sentences, predictions, references, documents)

    Example:
        >>> tokenize_sentences = False
        >>> predictions = [["Sentence 1 in prediction 1.", "Sentence 2 in prediction 1."],
        ...                ["Sentence 1 in prediction 2.", "Sentence 2 in prediction 2."]]
        >>> references = [["Sentences in reference 1."], ["Sentences in reference 2."]]
        >>> documents = [["Sentence 1 in document 1.", "Sentence 2 in document 1."],
        ...              ["Sentence 1 in document 2.", "Sentence 2 in document 2."]]
        >>> _validate_input_format(tokenize_sentences, predictions, references, documents)
    """
    if not (len(predictions) == len(references) == len(documents)):
        raise ValueError(
            f"Predictions, References and Documents must have the same length. "
            f"Got {len(predictions)} predictions, {len(references)} references and {len(documents)} documents."
        )

    if len(predictions) == 0:
        raise ValueError("Can't have empty inputs")

    def check_format(lst_obj, expected_depth: int, name: str):
        is_valid, error_message = is_nested_list_of_type(lst_obj, element_type=str, depth=expected_depth)
        if not is_valid:
            raise ValueError(f"{name} are not in the expected format.\n"
                             f"Error: {error_message}.")

    # Raw documents are flat lists of strings; pre-split documents are lists of sentence lists.
    expected_depth = 1 if tokenize_sentences else 2
    try:
        check_format(predictions, expected_depth=expected_depth, name="predictions")
        check_format(references, expected_depth=expected_depth, name="references")
        check_format(documents, expected_depth=expected_depth, name="documents")
    except ValueError as ve:
        # Chain explicitly so the original failure is kept as the cause.
        raise ValueError(f"Input validation error: {ve}") from ve


def _score_document(
        doc_embeddings: NDArray,
        ref_embeddings: NDArray,
        pred_embeddings: NDArray,
        doc_sentences: List[str],
        k: int,
        debug: bool,
) -> Union[float, RankedGains]:
    """
    Score a single (prediction, reference, document) sample from its sentence embeddings.

    Args:
        doc_embeddings (NDArray): Embeddings of the document sentences.
        ref_embeddings (NDArray): Embeddings of the reference sentences.
        pred_embeddings (NDArray): Embeddings of the predicted sentences.
        doc_sentences (List[str]): The document sentences, in the same order as `doc_embeddings`;
            only read when `debug` is True.
        k (int): Requested rank threshold; capped at the number of document sentences.
        debug (bool): Whether to return the full `RankedGains` instead of just the score.

    Returns:
        Union[float, RankedGains]: The NCG score, or a `RankedGains` holding the score together with the
            ranked sentences when `debug` is True.
    """
    # How similar each document sentence is to the reference / to the prediction.
    ref_sim_scores = compute_cosine_similarity(doc_embeddings, ref_embeddings)
    pred_sim_scores = compute_cosine_similarity(doc_embeddings, pred_embeddings)

    ground_truth_gain = compute_gain(ref_sim_scores)
    # Only the ordering of `pred_gain` matters: it selects the model's top-ranked sentence indices.
    pred_gain = compute_gain(pred_sim_scores)

    real_k = min(len(pred_gain), k)
    ncg_score = compute_ncg(pred_gain, ground_truth_gain, real_k)
    if not debug:
        return ncg_score

    # Replace sentence indices by the sentences themselves for readability.
    return RankedGains(
        [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in ground_truth_gain],
        [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in pred_gain],
        k=real_k,
        ncg=ncg_score,
    )


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemNCG(evaluate.Metric):
    """
    SemnCG (Semantic Normalized Cumulative Gain) Metric.

    This metric evaluates the quality of predicted sentences in relation to reference sentences and documents using
    Semantic Normalized Cumulative Gain (NCG). It computes the gain values and NCG scores based on cosine similarity
    between sentence embeddings, leveraging a Sentence-BERT encoder.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", **kwargs):
        """Load the sentence encoder named `model_name`, then initialise the underlying `evaluate.Metric`."""
        self.sbert_encoder = get_sbert_encoder(model_name)
        super().__init__(**kwargs)

    def _info(self):
        """Return the `evaluate.MetricInfo` describing this metric and its accepted input features."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=[
                # Tokenize_Sentences = True
                datasets.Features(
                    {
                        "predictions": datasets.Value("string"),
                        "references": datasets.Value("string"),
                        "documents": datasets.Value("string"),
                    }
                ),
                # Tokenize_Sentences = False
                datasets.Features(
                    {
                        "predictions": datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                        "documents": datasets.Sequence(datasets.Value("string", id="sequence"), id="documents"),
                    }
                ),
            ],
            # # Homepage of the module for documentation
            # homepage="http://module.homepage",
            # # Additional links to the codebase or references
            # codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["https://aclanthology.org/2022.findings-acl.122/"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        nltk.download("punkt", quiet=True)
        # Recent NLTK releases load the Punkt models from "punkt_tab" instead of "punkt"; fetch both so that
        # sentence tokenization works regardless of the installed NLTK version.
        # NOTE(review): presumably consumed by `tokenize_and_prep_document` - confirm against `.utils`.
        nltk.download("punkt_tab", quiet=True)

    def _compute(
            self,
            predictions: DOCUMENT_TYPE,
            references: DOCUMENT_TYPE,
            documents: DOCUMENT_TYPE,
            k: int = 3,
            gpu: DEVICE_TYPE = False,
            verbose: bool = False,
            batch_size: int = 32,
            tokenize_sentences: bool = True,
            pre_compute_embeddings: bool = False,
            debug: bool = False,
    ) -> Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
        """
        Compute the Semantic Normalized Cumulative Gain (SemnCG) score.

        Args:
            predictions (DOCUMENT_TYPE): The predicted sentences.
                `tokenize_sentences`=True -> predictions: List[str]
                `tokenize_sentences`=False -> predictions: List[List[str]]
            references (DOCUMENT_TYPE): The reference sentences.
                `tokenize_sentences`=True -> references: List[str]
                `tokenize_sentences`=False -> references: List[List[str]]
            documents (DOCUMENT_TYPE): Input documents.
                `tokenize_sentences`=True -> documents: List[str]
                `tokenize_sentences`=False -> documents: List[List[str]]
            k (int, optional): The rank threshold used for evaluating gains (typically top-k sentences).
                Must be at least 1. Default is 3.
            gpu (DEVICE_TYPE, optional): Whether to use GPU for computation. Default is False.
            verbose (bool, optional): Whether to print verbose logs. Default is False.
            batch_size (int, optional): The batch size for encoding sentences. Default is 32.
            tokenize_sentences (bool, optional): Whether to tokenize sentences. If True, sentences are tokenized
                before processing. Default is True.
            pre_compute_embeddings (bool, optional): Whether to pre-compute embeddings for all sentences.
                This speeds up computation but requires more memory. Default is False.
            debug (bool, optional): Whether to return detailed debug information including ranked gains.
                Default=False.

        Returns:
            Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
                If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores
                for each document.
                If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains`
                objects with detailed gain information for each document.

        Raises:
            ValueError: If the format of predictions, references, or documents does not meet the specified
                criteria, or if `k` is smaller than 1.

        Notes:
            - Validates the format of predictions, references, and documents based on `tokenize_sentences`.
            - Computes embeddings using a Sentence-BERT encoder.
            - Computes cosine similarity between document, reference, and prediction embeddings.
            - Calculates gain values and Normalized Cumulative Gain (NCG) scores.
            - Optionally returns detailed debug information for each document if `debug` is True.
        """
        # Validate inputs corresponding to flags
        _validate_input_format(tokenize_sentences, predictions, references, documents)
        # k = 0 would silently score every sample 0 and a negative k would slice the rankings from the wrong end.
        if k < 1:
            raise ValueError(f"`k` must be a positive integer. Got {k}.")

        # Get GPU
        device = get_gpu(gpu)
        if verbose:
            print(f"Using devices: {device}")

        # Get model
        encoder = get_encoder(self.sbert_encoder, device=device, batch_size=batch_size, verbose=verbose)

        num_samples = len(documents)
        if pre_compute_embeddings:  # fast but takes more memory
            predictions = [tokenize_and_prep_document(pred, tokenize_sentences) for pred in predictions]
            references = [tokenize_and_prep_document(ref, tokenize_sentences) for ref in references]
            documents = [tokenize_and_prep_document(doc, tokenize_sentences) for doc in documents]

            # Kept so the debug output can map sentence indices back to text.
            sent_tokenized_documents = documents

            # Encode every sentence in one call: documents first, then references, then predictions.
            all_sentences = flatten_list(documents) + flatten_list(references) + flatten_list(predictions)
            embeddings = encoder.encode(all_sentences)

            prediction_sentences_count = [len(pred) for pred in predictions]
            reference_sentences_count = [len(ref) for ref in references]
            document_sentences_count = [len(doc) for doc in documents]

            # Split the flat embedding matrix back into per-sample groups (same order as concatenated above).
            ref_offset = sum(document_sentences_count)
            pred_offset = ref_offset + sum(reference_sentences_count)
            all_doc_embeddings = slice_embeddings(embeddings, document_sentences_count)
            all_ref_embeddings = slice_embeddings(embeddings[ref_offset:], reference_sentences_count)
            all_pred_embeddings = slice_embeddings(embeddings[pred_offset:], prediction_sentences_count)

            iterable_obj = zip(all_pred_embeddings, all_ref_embeddings, all_doc_embeddings)
        else:
            iterable_obj = zip(predictions, references, documents)

        out = []
        # `zip` has no length, so give tqdm the total explicitly to get a meaningful progress bar.
        for idx, (pred, ref, doc) in enumerate(tqdm(iterable_obj, total=num_samples)):
            if pre_compute_embeddings:
                # The loop items already are the per-sample embeddings.
                doc_embeddings, ref_embeddings, pred_embeddings = doc, ref, pred
                doc_sentences = sent_tokenized_documents[idx]
            else:
                ref_sentences = tokenize_and_prep_document(ref, tokenize_sentences)
                pred_sentences = tokenize_and_prep_document(pred, tokenize_sentences)
                doc_sentences = tokenize_and_prep_document(doc, tokenize_sentences)

                # Encode this sample's sentences in one call, then split by role.
                doc_sentence_count = len(doc_sentences)
                ref_sentence_count = len(ref_sentences)
                embeddings = encoder.encode(doc_sentences + ref_sentences + pred_sentences)
                ref_end = doc_sentence_count + ref_sentence_count
                doc_embeddings = embeddings[:doc_sentence_count]
                ref_embeddings = embeddings[doc_sentence_count:ref_end]
                pred_embeddings = embeddings[ref_end:]

            out.append(
                _score_document(doc_embeddings, ref_embeddings, pred_embeddings, doc_sentences, k=k, debug=debug)
            )

        if debug:
            return statistics.mean([ele.ncg for ele in out]), out
        return statistics.mean(out), out