Spaces:

BridgeAI-Lab
/

Sem-nCG

Running

App Files Files Community

Sem-nCG / semncg.py

nbansal

Added author

c2e3dae 4 months ago

raw

history blame

27.3 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	Sem-NCG metric
	Author: Naman Bansal
	"""

	import statistics
	from dataclasses import dataclass
	from typing import List, Tuple, Union

	import datasets
	import evaluate
	import nltk
	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity
	from tqdm import tqdm

	from .encoder_models import get_sbert_encoder, get_encoder
	from .type_aliases import DEVICE_TYPE, NDArray, DOCUMENT_TYPE
	from .utils import get_gpu, flatten_list, slice_embeddings, is_nested_list_of_type, \
	tokenize_and_prep_document

	# BibTeX entry for the Sem-nCG paper; supplied as `citation` in `SemNCG._info`.
	_CITATION = """\
	@inproceedings{akter-etal-2022-revisiting,
	title = "Revisiting Automatic Evaluation of Extractive Summarization Task: Can We Do Better than {ROUGE}?",
	author = "Akter, Mousumi and
	Bansal, Naman and
	Karmaker, Shubhra Kanti",
	editor = "Muresan, Smaranda and
	Nakov, Preslav and
	Villavicencio, Aline",
	booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
	month = may,
	year = "2022",
	address = "Dublin, Ireland",
	publisher = "Association for Computational Linguistics",
	url = "https://aclanthology.org/2022.findings-acl.122",
	doi = "10.18653/v1/2022.findings-acl.122",
	pages = "1547--1560",
	abstract = "It has been the norm for a long time to evaluate automated summarization tasks using the popular ROUGE metric. Although several studies in the past have highlighted the limitations of ROUGE, researchers have struggled to reach a consensus on a better alternative until today. One major limitation of the traditional ROUGE metric is the lack of semantic understanding (relies on direct overlap of n-grams). In this paper, we exclusively focus on the extractive summarization task and propose a semantic-aware nCG (normalized cumulative gain)-based evaluation metric (called Sem-nCG) for evaluating this task. One fundamental contribution of the paper is that it demonstrates how we can generate more reliable semantic-aware ground truths for evaluating extractive summarization tasks without any additional human intervention. To the best of our knowledge, this work is the first of its kind. We have conducted extensive experiments with this new metric using the widely used CNN/DailyMail dataset. Experimental results show that the new Sem-nCG metric is indeed semantic-aware, shows higher correlation with human judgement (more reliable) and yields a large number of disagreements with the original ROUGE metric (suggesting that ROUGE often leads to inaccurate conclusions also verified by humans).",
	}
	"""

	# Short summary of the metric; supplied as `description` in `SemNCG._info` and
	# prepended to the class docstring via `add_start_docstrings`.
	_DESCRIPTION = """\
	Sem-nCG (Semantic Normalized Cumulative Gain) Metric evaluates the quality of predicted sentences
	(abstractive/extractive) in relation to reference sentences and documents using Semantic Normalized Cumulative Gain
	(NCG). It computes gain values and NCG scores based on cosine similarity between sentence embeddings, leveraging a
	Sentence-BERT encoder. This metric is designed to assess the relevance and ranking of predicted sentences, making it
	useful for tasks such as summarization and information retrieval.
	"""

	# Argument/return documentation for `compute`; supplied as `inputs_description`
	# in `SemNCG._info` and prepended to the class docstring via `add_start_docstrings`.
	_KWARGS_DESCRIPTION = """
	Sem-nCG (Semantic Normalized Cumulative Gain) compares the system-generated summaries (predictions) with ground truth
	reference summaries (references) and input documents (documents) using Semantic Normalized Cumulative Gain (NCG).
	It computes gain values and NCG scores based on sentence embeddings.

	Args:
	predictions (DOCUMENT_TYPE): The predicted sentences.
	`tokenize_sentences`=True -> predictions: List[str]
	`tokenize_sentences`=False -> predictions: List[List[str]]
	references (DOCUMENT_TYPE): The reference sentences.
	`tokenize_sentences`=True -> references: List[str]
	`tokenize_sentences`=False -> references: List[List[str]]
	documents (DOCUMENT_TYPE): Input documents.
	`tokenize_sentences`=True -> documents: List[str]
	`tokenize_sentences`=False -> documents: List[List[str]]
	k (int): The rank threshold used for evaluating gains (typically top-k sentences). Default is 3.
	gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
	bool -
	False - CPU (Default)
	True - GPU (device 0) if gpu is available else CPU
	int -
	n - GPU, device index n
	str -
	'cuda', 'gpu', 'cpu'
	List[Union[str, int]] - Multiple GPUs/cpus i.e. use multiple processes when computing embeddings
	batch_size (int): Batch size for encoding. Default is 32.
	verbose (bool): Flag to indicate verbose output. Default is False.
	tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
	pre_compute_embeddings (bool): Flag to indicate whether to pre-compute embeddings for all sentences. This speeds up
	computation but requires more memory. Default is False.
	debug (bool): Flag to return detailed debug information including ranked gains. Default is False.

	Returns:
	Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
	If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores for each document.
	If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains` objects with
	detailed gain information for each document.

	Examples of input formats:

	Case 1: tokenize_sentences = True
	predictions: List[str] - List of predictions where each prediction is a document.
	references: List[str] - List of references where each reference is a document.
	documents: List[str] - List of input documents where each document is a document.
	Example:
	predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
	references = ["This is a reference sentence 1. This is a reference sentence 2."]
	documents = ["This is a document sentence 1. This is a document sentence 2."]

	Case 2: tokenize_sentences = False
	predictions: List[List[str]] - List of predictions where each prediction is a list of sentences.
	references: List[List[str]] - List of references where each reference is a list of sentences.
	documents: List[List[str]] - List of input documents where each document is a list of sentences.
	Example:
	predictions = [["This is a prediction sentence 1.", "This is a prediction sentence 2."]]
	references = [["This is a reference sentence 1.", "This is a reference sentence 2."]]
	documents = [["This is a document sentence 1.", "This is a document sentence 2."]]

	Examples:

	>>> import evaluate
	>>> predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
	>>> references = ["This is a reference sentence 1. This is a reference sentence 2."]
	>>> documents = ["This is a document sentence 1. This is a document sentence 2."]
	>>> metric = evaluate.load("nbansal/semncg", model_name="all-MiniLM-L6-v2")
	>>> mean_score, scores = metric.compute(predictions=predictions, references=references, documents=documents)
	>>> print(f"Mean SemnCG: {mean_score}")
	"""


	@dataclass
	class RankedGains:
	    """
	    Per-example record of the rankings behind a single Sem-nCG score.

	    Instances are produced only when the metric is run with `debug=True`.

	    Attributes:
	        gt_gains (List[Tuple[str, float]]): Ideal ranking. Each pair holds a document sentence and the
	            gain assigned to it from its similarity to the reference summary.
	        pred_gains (List[Tuple[str, float]]): Model ranking. Each pair holds a document sentence and the
	            gain assigned to it from its similarity to the predicted summary.
	        k (int): Rank cut-off that was actually applied when scoring this example.
	        ncg (float): Normalized Cumulative Gain of the model ranking against the ideal ranking.

	    Notes:
	        - Both gain lists are ordered from the highest gain to the lowest.
	        - Field order is part of the interface: instances are built positionally.
	    """
	    gt_gains: List[Tuple[str, float]]
	    pred_gains: List[Tuple[str, float]]
	    k: int
	    ncg: float


	def compute_cosine_similarity(doc_embeds: NDArray, ref_embeds: NDArray) -> List[float]:
	    """
	    Score every document sentence by its average cosine similarity to a set of reference sentences.

	    Args:
	        doc_embeds (NDArray): 2D array of shape (#Docs, Embedding_dim), one row per document sentence.
	        ref_embeds (NDArray): 2D array of shape (#Refs, Embedding_dim), one row per reference sentence.

	    Returns:
	        List[float]: One value per row of `doc_embeds`: the mean of that row's cosine similarities
	        to all rows of `ref_embeds`.

	    Notes:
	        - Pairwise similarities come from `sklearn.metrics.pairwise.cosine_similarity`.
	    """
	    # Full similarity matrix first, then collapse the reference axis.
	    pairwise = cosine_similarity(doc_embeds, ref_embeds)  # [#Docs, #Refs]
	    per_sentence_mean = np.mean(pairwise, axis=1)  # [#Docs]
	    return per_sentence_mean.tolist()


	def compute_gain(sim_scores: List[float]) -> List[Tuple[int, float]]:
	    """
	    Turn similarity scores into rank-based gain values.

	    Args:
	        sim_scores (List[float]): One similarity score per document sentence
	            (e.g. the output of `compute_cosine_similarity(doc_embeds, ref_embeds)`).

	    Returns:
	        List[Tuple[int, float]]: `(sentence_index, gain)` pairs ordered from the highest gain to the lowest.
	        Indices are plain Python `int`s. An empty input yields an empty list.

	    Notes:
	        - Only the rank order of the scores matters, not their magnitude.
	        - With n scores, the sentence at rank r (r = 1 for the highest score) receives
	          gain = (n - r + 1) / (n * (n + 1) / 2), so the best sentence gets n / sum(1..n),
	          the worst gets 1 / sum(1..n), and all gains add up to 1.
	    """
	    count = len(sim_scores)
	    # Sentence indices ordered from the highest score to the lowest; `.tolist()` converts the
	    # numpy integers to native ints so the result matches the annotated return type.
	    ranked_indices = np.array(sim_scores).argsort()[::-1].tolist()
	    rank_sum = count * (count + 1) / 2  # 1 + 2 + ... + n
	    # For empty input the comprehension has nothing to iterate, so the zero `rank_sum` is never used.
	    return [(s_idx, weight / rank_sum) for s_idx, weight in zip(ranked_indices, range(count, 0, -1))]


	def score_ncg(model_relevance: List[float], gt_relevance: List[float]) -> float:
	    """
	    Calculate the Normalized Cumulative Gain (NCG) from model and ground-truth gain values.

	    Args:
	        model_relevance (List[float]): Gain values collected for the model's selection.
	        gt_relevance (List[float]): Gain values of the ground truth (ideal) selection.

	    Returns:
	        float: `sum(model_relevance) / sum(gt_relevance)`, or `0.0` when the ground-truth sum is zero
	        (which includes an empty `gt_relevance`).

	    Notes:
	        - The zero-denominator case returns a float, so the return type is always `float`.
	    """
	    # Cumulative gain of the model's selection and of the ideal selection.
	    cg = sum(model_relevance)
	    icg = sum(gt_relevance)

	    # Guard the division; 0.0 (not int 0) keeps the annotated return type.
	    if icg == 0:
	        return 0.0
	    return cg / icg


	def compute_ncg(pred_gains: List[Tuple[int, float]], gt_gains: List[Tuple[int, float]], k: int) -> float:
	    """
	    Score the model's top-k selection against the ideal top-k selection.

	    Args:
	        pred_gains (List[Tuple[int, float]]): `(index, gain)` pairs for the model ranking, best first.
	            Only the indices of the first `k` pairs are used.
	        gt_gains (List[Tuple[int, float]]): `(index, gain)` pairs for the ground-truth ranking, best first.
	        k (int): Rank cut-off; only the first `k` entries of each list are considered.

	    Returns:
	        float: Result of `score_ncg` applied to the ground-truth gains of the model's top-k indices
	        and the ground-truth gains of the ideal top-k indices.

	    Notes:
	        - The model's own gain values are ignored: each index it picked is looked up in `gt_gains`.
	        - Every index among the first `k` entries of `pred_gains` must also appear in `gt_gains`.
	    """
	    # Ground-truth gain of every index, for looking up the model's picks.
	    ideal_gain_by_index = {index: gain for index, gain in gt_gains}

	    ideal_top_k = [gain for _, gain in gt_gains[:k]]
	    achieved_top_k = [ideal_gain_by_index[index] for index, _ in pred_gains[:k]]

	    return score_ncg(achieved_top_k, ideal_top_k)


	def _validate_input_format(
	        tokenize_sentences: bool,
	        predictions: DOCUMENT_TYPE,
	        references: DOCUMENT_TYPE,
	        documents: DOCUMENT_TYPE
	):
	    """
	    Check that predictions, references and documents have matching sizes and the expected nesting.

	    Args:
	        tokenize_sentences (bool): Selects the expected input shape (see below).
	        predictions (DOCUMENT_TYPE): Predictions to validate.
	        references (DOCUMENT_TYPE): References to validate.
	        documents (DOCUMENT_TYPE): Documents to validate.

	    Raises:
	        ValueError: If the three inputs differ in length, if they are empty, or if any of them
	            does not have the expected nesting of strings.

	    Validation Criteria:
	        1. All three inputs must have the same, non-zero length.
	        2. If `tokenize_sentences` is True, each input must be a list of strings (depth 1).
	        3. If `tokenize_sentences` is False, each input must be a list of lists of strings (depth 2).

	    Notes:
	        - Nesting is checked with `is_nested_list_of_type(obj, element_type=str, depth=...)`.
	        - Inputs are checked in the order predictions, references, documents; checking stops at the
	          first one that fails.

	    Example:
	        >>> _validate_input_format(True, ["Prediction."], ["Reference."], ["Document."])
	        >>> _validate_input_format(False, [["Prediction."]], [["Reference."]], [["Document."]])
	    """
	    if len(predictions) != len(references) or len(references) != len(documents):
	        raise ValueError("Predictions, References and Documents must have the same length.")

	    if len(predictions) == 0:
	        raise ValueError("Can't have empty inputs")

	    # Whole-document strings sit one level deep; pre-split sentence lists sit two levels deep.
	    expected_depth = 1 if tokenize_sentences else 2

	    for candidate in (predictions, references, documents):
	        if not is_nested_list_of_type(candidate, element_type=str, depth=expected_depth):
	            raise ValueError(
	                "Predictions, References and Documents are not valid input format. Refer to documentation."
	            )


	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class SemNCG(evaluate.Metric):
	    """
	    SemnCG (Semantic Normalized Cumulative Gain) Metric.

	    This metric evaluates the quality of predicted sentences in relation to reference sentences and documents
	    using Semantic Normalized Cumulative Gain (NCG). It computes the gain values and NCG scores based on
	    cosine similarity between sentence embeddings, leveraging a Sentence-BERT encoder.
	    """

	    def __init__(self, model_name: str = "all-MiniLM-L6-v2", **kwargs):
	        """
	        Create the metric and load its sentence encoder.

	        Args:
	            model_name (str): Name passed to `get_sbert_encoder`. Default is "all-MiniLM-L6-v2".
	            **kwargs: Forwarded unchanged to `evaluate.Metric.__init__`.
	        """
	        self.sbert_encoder = get_sbert_encoder(model_name)
	        super().__init__(**kwargs)

	    def _info(self):
	        """Return the `evaluate.MetricInfo` describing this metric and its two accepted input schemas."""
	        return evaluate.MetricInfo(
	            # This is the description that will appear on the modules page.
	            module_type="metric",
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            # This defines the format of each prediction and reference
	            features=[
	                # Tokenize_Sentences = True
	                datasets.Features(
	                    {
	                        "predictions": datasets.Value("string"),
	                        "references": datasets.Value("string"),
	                        "documents": datasets.Value("string"),
	                    }
	                ),
	                # Tokenize_Sentences = False
	                datasets.Features(
	                    {
	                        "predictions": datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
	                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
	                        "documents": datasets.Sequence(datasets.Value("string", id="sequence"), id="documents"),
	                    }
	                ),
	            ],
	            # # Homepage of the module for documentation
	            # homepage="http://module.homepage",
	            # # Additional links to the codebase or references
	            # codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
	            reference_urls=["https://aclanthology.org/2022.findings-acl.122/"]
	        )

	    def _download_and_prepare(self, dl_manager):
	        """Optional: download external resources useful to compute the scores"""
	        # NOTE(review): newer NLTK releases look up "punkt_tab" rather than "punkt" for sentence
	        # tokenization — confirm which resource `tokenize_and_prep_document` actually needs.
	        nltk.download("punkt", quiet=True)

	    @staticmethod
	    def _precomputed_embeddings(encoder, predictions, references, documents):
	        """Encode all sentences of all examples in one call and split the result back per example.

	        All three inputs must already be lists of sentence lists. Returns an iterable of
	        `(doc_sentences, doc_embeddings, ref_embeddings, pred_embeddings)`, one item per example.
	        """
	        doc_counts = [len(doc) for doc in documents]
	        ref_counts = [len(ref) for ref in references]
	        pred_counts = [len(pred) for pred in predictions]

	        # Sentences are encoded in the order documents -> references -> predictions,
	        # so the offsets below must follow the same order.
	        all_sentences = flatten_list(documents) + flatten_list(references) + flatten_list(predictions)
	        embeddings = encoder.encode(all_sentences)

	        ref_start = sum(doc_counts)
	        pred_start = ref_start + sum(ref_counts)

	        doc_embeddings = slice_embeddings(embeddings, doc_counts)
	        ref_embeddings = slice_embeddings(embeddings[ref_start:], ref_counts)
	        pred_embeddings = slice_embeddings(embeddings[pred_start:], pred_counts)

	        return zip(documents, doc_embeddings, ref_embeddings, pred_embeddings)

	    @staticmethod
	    def _lazy_embeddings(encoder, predictions, references, documents, tokenize_sentences):
	        """Tokenize and encode one example at a time, keeping only one example's embeddings alive.

	        Yields `(doc_sentences, doc_embeddings, ref_embeddings, pred_embeddings)` per example.
	        """
	        for pred, ref, doc in zip(predictions, references, documents):
	            ref_sentences = tokenize_and_prep_document(ref, tokenize_sentences)
	            pred_sentences = tokenize_and_prep_document(pred, tokenize_sentences)
	            doc_sentences = tokenize_and_prep_document(doc, tokenize_sentences)

	            # One encode call per example: documents, then references, then predictions.
	            doc_end = len(doc_sentences)
	            ref_end = doc_end + len(ref_sentences)
	            embeddings = encoder.encode(doc_sentences + ref_sentences + pred_sentences)

	            yield doc_sentences, embeddings[:doc_end], embeddings[doc_end:ref_end], embeddings[ref_end:]

	    def _compute(
	            self,
	            predictions: DOCUMENT_TYPE,
	            references: DOCUMENT_TYPE,
	            documents: DOCUMENT_TYPE,
	            k: int = 3,
	            gpu: DEVICE_TYPE = False,
	            verbose: bool = False,
	            batch_size: int = 32,
	            tokenize_sentences: bool = True,
	            pre_compute_embeddings: bool = False,
	            debug: bool = False,
	    ) -> Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
	        """
	        Compute the Semantic Normalized Cumulative Gain (SemnCG) score.

	        Args:
	            predictions (DOCUMENT_TYPE): The predicted sentences.
	                `tokenize_sentences`=True -> predictions: List[str]
	                `tokenize_sentences`=False -> predictions: List[List[str]]
	            references (DOCUMENT_TYPE): The reference sentences.
	                `tokenize_sentences`=True -> references: List[str]
	                `tokenize_sentences`=False -> references: List[List[str]]
	            documents (DOCUMENT_TYPE): Input documents.
	                `tokenize_sentences`=True -> documents: List[str]
	                `tokenize_sentences`=False -> documents: List[List[str]]
	            k (int, optional): The rank threshold used for evaluating gains (typically top-k sentences).
	                Must be at least 1; it is capped per example at the number of document sentences. Default is 3.
	            gpu (DEVICE_TYPE, optional): Device selection, resolved by `get_gpu`. Default is False.
	            verbose (bool, optional): Whether to print verbose logs. Default is False.
	            batch_size (int, optional): The batch size for encoding sentences. Default is 32.
	            tokenize_sentences (bool, optional): Passed to `tokenize_and_prep_document` and used to select
	                the expected input shape. Default is True.
	            pre_compute_embeddings (bool, optional): If True, all sentences of all examples are encoded in a
	                single call before scoring (more memory); otherwise each example is encoded on its own.
	                Default is False.
	            debug (bool, optional): Whether to return detailed debug information including ranked gains.
	                Default is False.

	        Returns:
	            Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
	                If `debug` is False, a tuple of the mean SemnCG score and the list of per-example scores.
	                If `debug` is True, a tuple of the mean SemnCG score and a list of `RankedGains` objects,
	                one per example.

	        Raises:
	            ValueError: If the inputs fail `_validate_input_format`, or if `k` is smaller than 1.
	        """
	        # Validate inputs corresponding to flags
	        _validate_input_format(tokenize_sentences, predictions, references, documents)

	        # A non-positive cut-off would slice the rankings from the wrong end (or to nothing)
	        # and silently produce meaningless scores.
	        if k < 1:
	            raise ValueError("k must be a positive integer.")

	        example_count = len(documents)

	        # Get GPU
	        device = get_gpu(gpu)
	        if verbose:
	            print(f"Using devices: {device}")

	        # Get model
	        encoder = get_encoder(self.sbert_encoder, device=device, batch_size=batch_size, verbose=verbose)

	        if pre_compute_embeddings:  # fast but takes more memory
	            predictions = [tokenize_and_prep_document(pred, tokenize_sentences) for pred in predictions]
	            references = [tokenize_and_prep_document(ref, tokenize_sentences) for ref in references]
	            documents = [tokenize_and_prep_document(doc, tokenize_sentences) for doc in documents]
	            per_example = self._precomputed_embeddings(encoder, predictions, references, documents)
	        else:
	            per_example = self._lazy_embeddings(encoder, predictions, references, documents, tokenize_sentences)

	        out = []
	        # `total` is given explicitly because neither a zip nor a generator exposes a length.
	        for doc_sentences, doc_embeddings, ref_embeddings, pred_embeddings in tqdm(per_example, total=example_count):
	            # Compute Pair-Wise Cosine Similarity
	            ref_sim_scores = compute_cosine_similarity(doc_embeddings, ref_embeddings)
	            pred_sim_scores = compute_cosine_similarity(doc_embeddings, pred_embeddings)

	            # Ideal ranking of document sentences (by similarity to the reference)
	            ground_truth_gain = compute_gain(ref_sim_scores)

	            # Model ranking of document sentences (by similarity to the prediction);
	            # only its ordering is used to pick the top-k sentence indices.
	            pred_gain = compute_gain(pred_sim_scores)
	            real_k = min(len(pred_gain), k)

	            # Compute NCG Scores
	            ncg_score = compute_ncg(pred_gain, ground_truth_gain, real_k)

	            if debug:
	                # Replace sentence indices with the sentence text for readability.
	                ground_truth_gain = [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in ground_truth_gain]
	                pred_gain = [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in pred_gain]
	                out.append(RankedGains(ground_truth_gain, pred_gain, k=real_k, ncg=ncg_score))
	            else:
	                out.append(ncg_score)

	        scores = [item.ncg for item in out] if debug else out
	        return statistics.mean(scores), out