# Sem-nCG / semncg.py
# Hugging Face Hub page metadata (not part of the module itself):
# last commit "Added author" (c2e3dae) by nbansal; file size 27.3 kB.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sem-NCG metric
Author: Naman Bansal
"""
import statistics
from dataclasses import dataclass
from typing import List, Tuple, Union
import datasets
import evaluate
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from .encoder_models import get_sbert_encoder, get_encoder
from .type_aliases import DEVICE_TYPE, NDArray, DOCUMENT_TYPE
from .utils import get_gpu, flatten_list, slice_embeddings, is_nested_list_of_type, \
tokenize_and_prep_document
# BibTeX entry for the Sem-nCG paper (Akter et al., Findings of ACL 2022).
# Passed as `citation` to `evaluate.MetricInfo` in `SemNCG._info`.
_CITATION = """\
@inproceedings{akter-etal-2022-revisiting,
title = "Revisiting Automatic Evaluation of Extractive Summarization Task: Can We Do Better than {ROUGE}?",
author = "Akter, Mousumi and
Bansal, Naman and
Karmaker, Shubhra Kanti",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.122",
doi = "10.18653/v1/2022.findings-acl.122",
pages = "1547--1560",
abstract = "It has been the norm for a long time to evaluate automated summarization tasks using the popular ROUGE metric. Although several studies in the past have highlighted the limitations of ROUGE, researchers have struggled to reach a consensus on a better alternative until today. One major limitation of the traditional ROUGE metric is the lack of semantic understanding (relies on direct overlap of n-grams). In this paper, we exclusively focus on the extractive summarization task and propose a semantic-aware nCG (normalized cumulative gain)-based evaluation metric (called Sem-nCG) for evaluating this task. One fundamental contribution of the paper is that it demonstrates how we can generate more reliable semantic-aware ground truths for evaluating extractive summarization tasks without any additional human intervention. To the best of our knowledge, this work is the first of its kind. We have conducted extensive experiments with this new metric using the widely used CNN/DailyMail dataset. Experimental results show that the new Sem-nCG metric is indeed semantic-aware, shows higher correlation with human judgement (more reliable) and yields a large number of disagreements with the original ROUGE metric (suggesting that ROUGE often leads to inaccurate conclusions also verified by humans).",
}
"""
# Short, human-readable summary of the metric.
# Used as `description` in `SemNCG._info` and handed to `add_start_docstrings` on the `SemNCG` class.
_DESCRIPTION = """\
Sem-nCG (Semantic Normalized Cumulative Gain) Metric evaluates the quality of predicted sentences
(abstractive/extractive) in relation to reference sentences and documents using Semantic Normalized Cumulative Gain
(NCG). It computes gain values and NCG scores based on cosine similarity between sentence embeddings, leveraging a
Sentence-BERT encoder. This metric is designed to assess the relevance and ranking of predicted sentences, making it
useful for tasks such as summarization and information retrieval.
"""
# Usage documentation for `compute`: arguments, return value, accepted input formats and an example.
# Used as `inputs_description` in `SemNCG._info` and handed to `add_start_docstrings` on the `SemNCG` class.
_KWARGS_DESCRIPTION = """
Sem-nCG (Semantic Normalized Cumulative Gain) compares the system-generated summaries (predictions) with ground truth
reference summaries (references) and input documents (documents) using Semantic Normalized Cumulative Gain (NCG).
It computes gain values and NCG scores based on sentence embeddings.
Args:
predictions (DOCUMENT_TYPE): The predicted sentences.
`tokenize_sentences`=True -> predictions: List[str]
`tokenize_sentences`=False -> predictions: List[List[str]]
references (DOCUMENT_TYPE): The reference sentences.
`tokenize_sentences`=True -> references: List[str]
`tokenize_sentences`=False -> references: List[List[str]]
documents (DOCUMENT_TYPE): Input documents.
`tokenize_sentences`=True -> documents: List[str]
`tokenize_sentences`=False -> documents: List[List[str]]
k (int): The rank threshold used for evaluating gains (typically top-k sentences). Default is 3.
gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
bool -
False - CPU (Default)
True - GPU (device 0) if gpu is available else CPU
int -
n - GPU, device index n
str -
'cuda', 'gpu', 'cpu'
List[Union[str, int]] - Multiple GPUs/cpus i.e. use multiple processes when computing embeddings
batch_size (int): Batch size for encoding. Default is 32.
verbose (bool): Flag to indicate verbose output. Default is False.
tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
pre_compute_embeddings (bool): Flag to indicate whether to pre-compute embeddings for all sentences. This speeds up
computation but requires more memory. Default is False.
debug (bool): Flag to return detailed debug information including ranked gains. Default is False.
Returns:
Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores for each document.
If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains` objects with
detailed gain information for each document.
Examples of input formats:
Case 1: tokenize_sentences = True
predictions: List[str] - List of predictions where each prediction is a document.
references: List[str] - List of references where each reference is a document.
documents: List[str] - List of input documents where each document is a document.
Example:
predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
references = ["This is a reference sentence 1. This is a reference sentence 2."]
documents = ["This is a document sentence 1. This is a document sentence 2."]
Case 2: tokenize_sentences = False
predictions: List[List[str]] - List of predictions where each prediction is a list of sentences.
references: List[List[str]] - List of references where each reference is a list of sentences.
documents: List[List[str]] - List of input documents where each document is a list of sentences.
Example:
predictions = [["This is a prediction sentence 1.", "This is a prediction sentence 2."]]
references = [["This is a reference sentence 1.", "This is a reference sentence 2."]]
documents = [["This is a document sentence 1.", "This is a document sentence 2."]]
Examples:
>>> import evaluate
>>> predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
>>> references = ["This is a reference sentence 1. This is a reference sentence 2."]
>>> documents = ["This is a document sentence 1. This is a document sentence 2."]
>>> metric = evaluate.load("nbansal/semncg", model_name="all-MiniLM-L6-v2")
>>> mean_score, scores = metric.compute(predictions=predictions, references=references, documents=documents)
>>> print(f"Mean SemnCG: {mean_score}")
"""
@dataclass
class RankedGains:
    """
    Per-sample debug record produced by `SemNCG._compute` when `debug=True`.

    Attributes:
        gt_gains (List[Tuple[str, float]]): Ideal ranking derived from the reference summary. Each entry pairs a
            document sentence with the gain assigned to it.
        pred_gains (List[Tuple[str, float]]): Ranking derived from the predicted summary. Each entry pairs a
            document sentence with the gain assigned to it.
        k (int): Rank cut-off that was actually applied when scoring this sample.
        ncg (float): Normalized Cumulative Gain obtained for this sample at cut-off `k`.

    Notes:
        - Both gain lists are ordered from the highest gain to the lowest.
        - Only the first `k` entries of each list contribute to `ncg`.
    """

    # Field order is part of the public constructor signature; do not reorder.
    gt_gains: List[Tuple[str, float]]
    pred_gains: List[Tuple[str, float]]
    k: int
    ncg: float
def compute_cosine_similarity(doc_embeds: NDArray, ref_embeds: NDArray) -> List[float]:
    """
    Score each document sentence by its average cosine similarity to a set of sentences.

    Args:
        doc_embeds (NDArray): 2D array of shape (#Docs, Embedding_dim), one row per document sentence.
        ref_embeds (NDArray): 2D array of shape (#Refs, Embedding_dim), one row per reference
            (or prediction) sentence.

    Returns:
        List[float]: One value per row of `doc_embeds`: the mean of that row's cosine similarities
            against every row of `ref_embeds`.

    Notes:
        - Pairwise similarities come from `sklearn.metrics.pairwise.cosine_similarity`.
        - The mean is taken over the reference axis, so the output length equals #Docs.
    """
    pairwise_scores = cosine_similarity(doc_embeds, ref_embeds)  # [#Docs, #Refs]
    per_doc_mean = np.mean(pairwise_scores, axis=1)  # collapse the #Refs axis
    return per_doc_mean.tolist()
def compute_gain(sim_scores: List[float]) -> List[Tuple[int, float]]:
    """
    Turn similarity scores into rank-based gain values.

    Args:
        sim_scores (List[float]): Similarity score of each document sentence, indexed by sentence position
            (e.g. the output of `compute_cosine_similarity(doc_embeds, ref_embeds)`).

    Returns:
        List[Tuple[int, float]]: `(sentence_index, gain)` pairs ordered from the highest gain to the lowest.
            Indices are plain Python `int`s and gains are plain Python `float`s. An empty input yields `[]`.

    Notes:
        - With n scores, the sentence with the highest score receives weight n, the next one n - 1, and so on
          down to 1 for the lowest score.
        - Each weight is divided by n * (n + 1) / 2 (the sum of all weights), so the gains sum to 1.
        - The relative order of sentences with equal scores is whatever the reversed default `numpy.argsort`
          produces.
    """
    count = len(sim_scores)
    # Sentence indices from most to least similar. `.tolist()` converts numpy.int64 to built-in int so the
    # result matches the annotated return type and stays JSON-serialisable.
    ranked_indices = np.array(sim_scores).argsort()[::-1].tolist()
    denominator = count * (count + 1) / 2  # sum of the weights 1..n
    # For an empty input the zip is empty, so the zero denominator is never used.
    return [(s_idx, weight / denominator) for s_idx, weight in zip(ranked_indices, range(count, 0, -1))]
def score_ncg(model_relevance: List[float], gt_relevance: List[float]) -> float:
    """
    Compute the Normalized Cumulative Gain (NCG) from two lists of gain values.

    Args:
        model_relevance (List[float]): Gain values collected for the sentences picked by the model.
        gt_relevance (List[float]): Gain values of the ideal (ground truth) selection.

    Returns:
        float: Cumulative gain of the model divided by the ideal cumulative gain. `0` is returned
            when the ideal cumulative gain is zero, so no division by zero can occur.

    Notes:
        - Cumulative gain is the plain sum of the gain values in a list.
        - The score lies between 0 and 1 when the model gains cannot exceed the ideal gains.
    """
    achieved_gain = sum(model_relevance)  # CG
    ideal_gain = sum(gt_relevance)  # ICG
    if ideal_gain == 0:
        return 0
    return achieved_gain / ideal_gain
def compute_ncg(pred_gains: List[Tuple[int, float]], gt_gains: List[Tuple[int, float]], k: int) -> float:
    """
    Compute the NCG score at rank `k` for one sample.

    Args:
        pred_gains (List[Tuple[int, float]]): `(sentence_index, gain)` pairs ranked by the prediction,
            highest gain first. Only the indices of the first `k` entries are used.
        gt_gains (List[Tuple[int, float]]): `(sentence_index, gain)` pairs ranked by the ground truth,
            highest gain first.
        k (int): Rank cut-off; only the first `k` entries of each list are considered.

    Returns:
        float: NCG score as returned by `score_ncg`.

    Notes:
        - The ideal gain is the sum of the first `k` ground truth gains.
        - The model gain is the sum of the ground truth gains of the sentences found in the first `k`
          predicted positions; the gain values stored in `pred_gains` are not used.
        - Every index in the first `k` entries of `pred_gains` must also appear in `gt_gains`.
    """
    # Ideal selection: the k best ground truth gains.
    ideal_top_k = [gain for _, gain in gt_gains[:k]]

    # Look up what the ground truth says about each sentence the model ranked in its top k.
    gain_by_position = dict(gt_gains)
    achieved_top_k = []
    for position, _ in pred_gains[:k]:
        achieved_top_k.append(gain_by_position[position])

    return score_ncg(achieved_top_k, ideal_top_k)
def _validate_input_format(
        tokenize_sentences: bool,
        predictions: DOCUMENT_TYPE,
        references: DOCUMENT_TYPE,
        documents: DOCUMENT_TYPE
):
    """
    Check that predictions, references and documents are aligned and correctly nested.

    Args:
        tokenize_sentences (bool): Whether the inputs are raw documents that still need sentence splitting.
        predictions (DOCUMENT_TYPE): Predictions to validate.
        references (DOCUMENT_TYPE): References to validate.
        documents (DOCUMENT_TYPE): Documents to validate.

    Raises:
        ValueError: If the three inputs differ in length, if they are empty, or if any of them does not
            have the nesting required by `tokenize_sentences`.

    Validation Criteria:
        1. All three inputs must have the same, non-zero length.
        2. If `tokenize_sentences` is True, each input must be a list of strings (nesting depth 1).
        3. If `tokenize_sentences` is False, each input must be a list of lists of strings (nesting depth 2).

    Notes:
        - `DOCUMENT_TYPE`: Union[List[str], List[List[str]]]
        - Nesting is checked with `is_nested_list_of_type(obj, element_type=str, depth=...)`.
        - Inputs are checked in the order predictions, references, documents; checking stops at the first
          one that fails.

    Example:
        >>> _validate_input_format(
        ...     True,
        ...     ["This is prediction 1.", "This is prediction 2."],
        ...     ["Reference for prediction 1.", "Reference for prediction 2."],
        ...     ["Document 1 content.", "Document 2 content."],
        ... )

    Example:
        >>> _validate_input_format(
        ...     False,
        ...     [["Sentence 1 in prediction 1."], ["Sentence 1 in prediction 2."]],
        ...     [["Sentences in reference 1."], ["Sentences in reference 2."]],
        ...     [["Sentence 1 in document 1."], ["Sentence 1 in document 2."]],
        ... )
    """
    if len(predictions) != len(references) or len(references) != len(documents):
        raise ValueError("Predictions, References and Documents must have the same length.")
    if len(predictions) == 0:
        raise ValueError("Can't have empty inputs")

    # Raw documents are flat lists of strings; pre-split documents carry one extra level of nesting.
    required_depth = 1 if tokenize_sentences else 2
    well_formed = all(
        is_nested_list_of_type(candidate, element_type=str, depth=required_depth)
        for candidate in (predictions, references, documents)
    )
    if not well_formed:
        raise ValueError("Predictions, References and Documents are not valid input format. Refer to documentation.")
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemNCG(evaluate.Metric):
    """
    SemnCG (Semantic Normalized Cumulative Gain) Metric.

    This metric evaluates the quality of predicted sentences in relation to reference sentences and documents
    using Semantic Normalized Cumulative Gain (NCG). It computes the gain values and NCG scores based on
    cosine similarity between sentence embeddings, leveraging a Sentence-BERT encoder.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", **kwargs):
        """
        Create the metric and its sentence encoder.

        Args:
            model_name (str): Name handed to `get_sbert_encoder`. Default is "all-MiniLM-L6-v2".
            **kwargs: Forwarded unchanged to `evaluate.Metric.__init__`.
        """
        # The encoder is created before the base-class initialiser runs (order kept from the original).
        self.sbert_encoder = get_sbert_encoder(model_name)
        super().__init__(**kwargs)

    def _info(self):
        """Describe the metric (description, citation, accepted input features) for the `evaluate` library."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Two accepted input schemas, one per value of `tokenize_sentences`.
            features=[
                # tokenize_sentences = True: every sample is a single string
                datasets.Features(
                    {
                        "predictions": datasets.Value("string"),
                        "references": datasets.Value("string"),
                        "documents": datasets.Value("string"),
                    }
                ),
                # tokenize_sentences = False: every sample is a sequence of sentence strings
                datasets.Features(
                    {
                        "predictions": datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                        "documents": datasets.Sequence(datasets.Value("string", id="sequence"), id="documents"),
                    }
                ),
            ],
            reference_urls=["https://aclanthology.org/2022.findings-acl.122/"]
        )

    def _download_and_prepare(self, dl_manager):
        """Download the NLTK sentence tokenizer data."""
        nltk.download("punkt", quiet=True)
        # NLTK >= 3.8.2 looks up "punkt_tab" instead of the pickled "punkt" models, so fetch both to keep
        # sentence splitting working across NLTK versions.
        # NOTE(review): assumes `tokenize_and_prep_document` relies on NLTK's punkt tokenizer - confirm.
        nltk.download("punkt_tab", quiet=True)

    def _compute(
            self,
            predictions: DOCUMENT_TYPE,
            references: DOCUMENT_TYPE,
            documents: DOCUMENT_TYPE,
            k: int = 3,
            gpu: DEVICE_TYPE = False,
            verbose: bool = False,
            batch_size: int = 32,
            tokenize_sentences: bool = True,
            pre_compute_embeddings: bool = False,
            debug: bool = False,
    ) -> Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
        """
        Compute the Semantic Normalized Cumulative Gain (SemnCG) score.

        Args:
            predictions (DOCUMENT_TYPE): The predicted sentences.
                `tokenize_sentences`=True -> predictions: List[str]
                `tokenize_sentences`=False -> predictions: List[List[str]]
            references (DOCUMENT_TYPE): The reference sentences.
                `tokenize_sentences`=True -> references: List[str]
                `tokenize_sentences`=False -> references: List[List[str]]
            documents (DOCUMENT_TYPE): Input documents.
                `tokenize_sentences`=True -> documents: List[str]
                `tokenize_sentences`=False -> documents: List[List[str]]
            k (int, optional): The rank threshold used for evaluating gains (typically top-k sentences). Must be
                at least 1. For a document with fewer than `k` sentences the number of sentences is used instead.
                Default is 3.
            gpu (DEVICE_TYPE, optional): Whether to use GPU for computation. Default is False.
            verbose (bool, optional): Whether to print verbose logs. Default is False.
            batch_size (int, optional): The batch size for encoding sentences. Default is 32.
            tokenize_sentences (bool, optional): Whether to tokenize sentences. If True, sentences are tokenized
                before processing. Default is True.
            pre_compute_embeddings (bool, optional): Whether to pre-compute embeddings for all sentences in a
                single encoder call. This speeds up computation but requires more memory. Default is False.
            debug (bool, optional): Whether to return detailed debug information including ranked gains.
                Default is False.

        Returns:
            Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
                If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores
                for each document.
                If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains`
                objects with detailed gain information for each document.

        Raises:
            ValueError: If the format of predictions, references, or documents does not meet the specified
                criteria, or if `k` is smaller than 1.

        Notes:
            - Validates the format of predictions, references, and documents based on `tokenize_sentences`.
            - Computes embeddings using a Sentence-BERT encoder.
            - Ranks document sentences by mean cosine similarity to the reference and to the prediction.
            - Calculates gain values and Normalized Cumulative Gain (NCG) scores.
            - Optionally returns detailed debug information for each document if `debug` is True.
        """
        # Validate inputs corresponding to flags
        _validate_input_format(tokenize_sentences, predictions, references, documents)
        # A non-positive k would turn the `[:k]` slices used for scoring into "all but the last |k|"
        # (or nothing at all) and silently yield meaningless scores.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}.")

        # Get GPU
        device = get_gpu(gpu)
        if verbose:
            print(f"Using devices: {device}")

        # Get model
        encoder = get_encoder(self.sbert_encoder, device=device, batch_size=batch_size, verbose=verbose)

        # zip() has no length, so remember the sample count for the progress bar.
        num_samples = len(documents)

        sent_tokenized_documents = None
        if pre_compute_embeddings:  # fast but takes more memory
            predictions = [tokenize_and_prep_document(pred, tokenize_sentences) for pred in predictions]
            references = [tokenize_and_prep_document(ref, tokenize_sentences) for ref in references]
            documents = [tokenize_and_prep_document(doc, tokenize_sentences) for doc in documents]

            # Sentence texts are only needed again to build the debug output.
            sent_tokenized_documents = documents

            # Encode everything in one call: documents first, then references, then predictions.
            all_sentences = flatten_list(documents) + flatten_list(references) + flatten_list(predictions)
            embeddings = encoder.encode(all_sentences)

            prediction_sentences_count = [len(pred) for pred in predictions]
            reference_sentences_count = [len(ref) for ref in references]
            document_sentences_count = [len(doc) for doc in documents]

            # Split the flat embedding matrix back into per-sample chunks (same order as encoded above).
            ref_offset = sum(document_sentences_count)
            pred_offset = ref_offset + sum(reference_sentences_count)
            all_doc_embeddings = slice_embeddings(embeddings, document_sentences_count)
            all_ref_embeddings = slice_embeddings(embeddings[ref_offset:], reference_sentences_count)
            all_pred_embeddings = slice_embeddings(embeddings[pred_offset:], prediction_sentences_count)
            iterable_obj = zip(all_pred_embeddings, all_ref_embeddings, all_doc_embeddings)
        else:
            iterable_obj = zip(predictions, references, documents)

        out = []
        for idx, (pred, ref, doc) in enumerate(tqdm(iterable_obj, total=num_samples)):
            if not pre_compute_embeddings:  # Compute embeddings for this sample only
                ref_sentences = tokenize_and_prep_document(ref, tokenize_sentences)
                pred_sentences = tokenize_and_prep_document(pred, tokenize_sentences)
                doc_sentences = tokenize_and_prep_document(doc, tokenize_sentences)

                # One encoder call per sample; split the result by sentence counts afterwards.
                doc_sentence_count = len(doc_sentences)
                ref_sentence_count = len(ref_sentences)
                all_sentences = doc_sentences + ref_sentences + pred_sentences
                embeddings = encoder.encode(all_sentences)
                doc_embeddings = embeddings[:doc_sentence_count]
                ref_embeddings = embeddings[doc_sentence_count:doc_sentence_count + ref_sentence_count]
                pred_embeddings = embeddings[doc_sentence_count + ref_sentence_count:]
            else:  # we already have embeddings
                doc_embeddings = doc
                ref_embeddings = ref
                pred_embeddings = pred
                doc_sentences = sent_tokenized_documents[idx]

            # Mean cosine similarity of every document sentence to the reference / prediction
            ref_sim_scores = compute_cosine_similarity(doc_embeddings, ref_embeddings)
            pred_sim_scores = compute_cosine_similarity(doc_embeddings, pred_embeddings)

            # Compute Gains
            ground_truth_gain = compute_gain(ref_sim_scores)
            # this is used to compute top-predicted sentence indices
            pred_gain = compute_gain(pred_sim_scores)

            # A document shorter than k sentences is scored on all of its sentences.
            real_k = min(len(pred_gain), k)

            # Compute NCG Scores
            ncg_score = compute_ncg(pred_gain, ground_truth_gain, real_k)

            if debug:
                # Replace sentence indices by the sentence text to make the record readable.
                ground_truth_gain = [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in ground_truth_gain]
                pred_gain = [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in pred_gain]
                out.append(RankedGains(ground_truth_gain, pred_gain, k=real_k, ncg=ncg_score))
            else:
                out.append(ncg_score)

        if debug:
            return statistics.mean([ele.ncg for ele in out]), out
        return statistics.mean(out), out