# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sem-NCG metric
Author: Naman Bansal
"""

import statistics
from dataclasses import dataclass
from typing import List, Tuple, Union

import datasets
import evaluate
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from .encoder_models import get_sbert_encoder, get_encoder
from .type_aliases import DEVICE_TYPE, NDArray, DOCUMENT_TYPE
from .utils import get_gpu, flatten_list, slice_embeddings, is_nested_list_of_type, \
    tokenize_and_prep_document

_CITATION = """\
@inproceedings{akter-etal-2022-revisiting,
    title = "Revisiting Automatic Evaluation of Extractive Summarization Task: Can We Do Better than {ROUGE}?",
    author = "Akter, Mousumi  and
      Bansal, Naman  and
      Karmaker, Shubhra Kanti",
    editor = "Muresan, Smaranda  and
      Nakov, Preslav  and
      Villavicencio, Aline",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
    month = may,
    year = "2022",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-acl.122",
    doi = "10.18653/v1/2022.findings-acl.122",
    pages = "1547--1560",
    abstract = "It has been the norm for a long time to evaluate automated summarization tasks using the popular ROUGE metric. Although several studies in the past have highlighted the limitations of ROUGE, researchers have struggled to reach a consensus on a better alternative until today. One major limitation of the traditional ROUGE metric is the lack of semantic understanding (relies on direct overlap of n-grams). In this paper, we exclusively focus on the extractive summarization task and propose a semantic-aware nCG (normalized cumulative gain)-based evaluation metric (called Sem-nCG) for evaluating this task. One fundamental contribution of the paper is that it demonstrates how we can generate more reliable semantic-aware ground truths for evaluating extractive summarization tasks without any additional human intervention. To the best of our knowledge, this work is the first of its kind. We have conducted extensive experiments with this new metric using the widely used CNN/DailyMail dataset. Experimental results show that the new Sem-nCG metric is indeed semantic-aware, shows higher correlation with human judgement (more reliable) and yields a large number of disagreements with the original ROUGE metric (suggesting that ROUGE often leads to inaccurate conclusions also verified by humans).",
}
"""

_DESCRIPTION = """\
Sem-nCG (Semantic Normalized Cumulative Gain) Metric evaluates the quality of predicted sentences 
(abstractive/extractive) in relation to reference sentences and documents using Semantic Normalized Cumulative Gain 
(NCG). It computes gain values and NCG scores based on cosine similarity between sentence embeddings, leveraging a 
Sentence-BERT encoder. This metric is designed to assess the relevance and ranking of predicted sentences, making it 
useful for tasks such as summarization and information retrieval.
"""

_KWARGS_DESCRIPTION = """
Sem-nCG (Semantic Normalized Cumulative Gain) compares the system-generated summaries (predictions) with ground truth 
reference summaries (references) and input documents (documents) using Semantic Normalized Cumulative Gain (NCG). 
It computes gain values and NCG scores based on sentence embeddings.

Args:
    predictions (DOCUMENT_TYPE): The predicted sentences. 
                                 `tokenize_sentences`=True -> predictions: List[str]
                                 `tokenize_sentences`=False -> predictions: List[List[str]]
    references (DOCUMENT_TYPE): The reference sentences.
                                `tokenize_sentences`=True -> references: List[str]
                                `tokenize_sentences`=False -> references: List[List[str]]
    documents (DOCUMENT_TYPE): Input documents.
                               `tokenize_sentences`=True -> documents: List[str]
                               `tokenize_sentences`=False -> documents: List[List[str]]
    k (int): The rank threshold used for evaluating gains (typically top-k sentences). Default is 3.
    gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
        bool - 
            False - CPU (Default)
            True - GPU (device 0) if gpu is available else CPU
        int - 
            n - GPU, device index n
        str - 
            'cuda', 'gpu', 'cpu' 
        List[Union[str, int]] - Multiple GPUs/CPUs, i.e., use multiple processes when computing embeddings
    batch_size (int): Batch size for encoding. Default is 32.
    verbose (bool): Flag to indicate verbose output. Default is False.
    tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
    pre_compute_embeddings (bool): Flag to indicate whether to pre-compute embeddings for all sentences. This speeds up 
                                   computation but requires more memory. Default is False.
    debug (bool): Flag to return detailed debug information including ranked gains. Default is False.

Returns:
    Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
    If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores for each document.
    If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains` objects with 
    detailed gain information for each document.

Examples of input formats:

Case 1: tokenize_sentences = True
    predictions: List[str] - List of predictions where each prediction is a single string (sentence-tokenized internally).
    references: List[str] - List of references where each reference is a single string (sentence-tokenized internally).
    documents: List[str] - List of input documents where each document is a single string (sentence-tokenized internally).
    Example:
        predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
        references = ["This is a reference sentence 1. This is a reference sentence 2."]
        documents = ["This is a document sentence 1. This is a document sentence 2."]

Case 2: tokenize_sentences = False
    predictions: List[List[str]] - List of predictions where each prediction is a list of sentences.
    references: List[List[str]] - List of references where each reference is a list of sentences.
    documents: List[List[str]] - List of input documents where each document is a list of sentences.
    Example:
        predictions = [["This is a prediction sentence 1.", "This is a prediction sentence 2."]]
        references = [["This is a reference sentence 1.", "This is a reference sentence 2."]]
        documents = [["This is a document sentence 1.", "This is a document sentence 2."]]

Examples:

    >>> import evaluate
    >>> predictions = ["This is a prediction sentence 1. This is a prediction sentence 2."]
    >>> references = ["This is a reference sentence 1. This is a reference sentence 2."]
    >>> documents = ["This is a document sentence 1. This is a document sentence 2."]
    >>> metric = evaluate.load("nbansal/semncg", model_name="all-MiniLM-L6-v2")  
    >>> mean_score, scores = metric.compute(predictions=predictions, references=references, documents=documents)
    >>> print(f"Mean SemnCG: {mean_score}")
"""


@dataclass
class RankedGains:
    """
    Dataclass to store ranked gains and associated metadata.

    Attributes:
        gt_gains (List[Tuple[str, float]]): List of tuples representing ground truth (ideal) gains,
            where each tuple contains a document sentence and its corresponding gain value.
        pred_gains (List[Tuple[str, float]]): List of tuples representing predicted gains by the model,
            where each tuple contains a document sentence and its corresponding gain value.
        k (int): The rank threshold used for evaluating gains (typically top-k sentences).
        ncg (float): Normalized Cumulative Gain (NCG) score calculated from the predicted gains
            compared to the ground truth gains.

    Notes:
        - `gt_gains` and `pred_gains` are typically sorted in descending order of gain value.
        - `k` specifies the top-k threshold used for evaluating the gains.
        - `ncg` provides a normalized measure of the model's ranking performance.
    """
    gt_gains: List[Tuple[str, float]]
    pred_gains: List[Tuple[str, float]]
    k: int
    ncg: float


def compute_cosine_similarity(doc_embeds: NDArray, ref_embeds: NDArray) -> List[float]:
    """
    Compute cosine similarity scores between each document embedding and reference embeddings.

    Args:
        doc_embeds (NDArray): 2D array of shape (#Docs, Embedding_dim) containing document embeddings.
        ref_embeds (NDArray): 2D array of shape (#Refs, Embedding_dim) containing reference embeddings.

    Returns:
        List[float]: A list of mean cosine similarity scores between each document and reference embeddings.
                     The length of the list is equal to the number of documents (#Docs).

    Notes:
        - Uses the cosine_similarity function from sklearn.metrics.pairwise to compute pairwise cosine similarities.
        - Returns the mean cosine similarity score across reference embeddings for each document embedding.
    """
    # Compute cosine similarity between predicted and reference embeddings
    cosine_scores = cosine_similarity(doc_embeds, ref_embeds)  # [#Docs, #Refs]
    return np.mean(cosine_scores, axis=1).tolist()
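
# A tiny worked example of `compute_cosine_similarity` (illustrative values, not taken from the paper):
# with two orthogonal 2-d document-sentence embeddings and a single reference embedding,
#
#     doc_embeds = np.array([[1.0, 0.0], [0.0, 1.0]])
#     ref_embeds = np.array([[1.0, 0.0]])
#     compute_cosine_similarity(doc_embeds, ref_embeds)  # -> [1.0, 0.0]
#
# i.e. each document sentence receives the mean of its cosine similarities to all reference sentences.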


def compute_gain(sim_scores: List[float]) -> List[Tuple[int, float]]:
    """
    Compute gain values for ranked similarity scores.

    Args:
        sim_scores (List[float]): List of similarity scores for documents (`compute_cosine_similarity(doc_embeds, ref_embeds)`)

    Returns:
        List[Tuple[int, float]]: A list of tuples where each tuple contains a document index and its corresponding gain 
                                 value. The list is sorted by descending order of gain values.

    Notes:
        - Computes gain values based on the rank order of similarity scores, where higher scores indicate higher gains.
        - Uses the formula: gain = (n - rank + 1) / (n * (n + 1) / 2), where rank = 1 for the
          highest-scoring sentence and n is the number of sentences.
        - Returns a list sorted by descending gain values.
    """
    count = len(sim_scores)
    sim_scores = np.array(sim_scores).argsort()[::-1]  # Doc sentence indices sorted by descending similarity
    denominator = count * (count + 1) / 2  # (n * (n+1))/2
    return [(s_idx, val / denominator) for s_idx, val in zip(sim_scores, range(count, 0, -1))]
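
# A minimal worked example of `compute_gain` (illustrative numbers): for sim_scores = [0.2, 0.9, 0.5],
# the document-sentence indices sorted by descending similarity are [1, 2, 0], n = 3, and the
# denominator is n * (n + 1) / 2 = 6, so
#
#     compute_gain([0.2, 0.9, 0.5])  # -> [(1, 3/6), (2, 2/6), (0, 1/6)] ≈ [(1, 0.5), (2, 0.33), (0, 0.17)]
#
# i.e. the most similar sentence (index 1) receives the largest gain, and the gains sum to 1.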


def score_ncg(model_relevance: List[float], gt_relevance: List[float]) -> float:
    """
    Calculate the Normalized Cumulative Gain (NCG) score based on model relevance and ground truth relevance.

    Args:
        model_relevance (List[float]): List of gain values representing the relevance scores predicted by the model.
        gt_relevance (List[float]): List of gain values representing the ground truth (ideal) relevance scores.

    Returns:
        float: Normalized Cumulative Gain (NCG) score, which measures the effectiveness of the model's relevance
               predictions compared to the ideal relevance scores. The score ranges from 0 to 1, where higher values
               indicate better performance.

    Notes:
        - Calculates Cumulative Gain (CG) for both model and ground truth relevance lists.
        - Normalizes CG scores by dividing model CG by ground truth CG to get the NCG score.
        - Returns 0 if the ground truth CG (icg) is 0 to avoid division by zero.
    """

    # CG score
    cg = sum(model_relevance)

    # ICG score
    icg = sum(gt_relevance)

    # Normalized CG score
    return cg / icg if icg != 0 else 0
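
# A short numeric example of `score_ncg` (illustrative): if the ideal top-k gains are [3/6, 2/6]
# (ICG = 5/6) and the model's top-k sentences have ground-truth gains [2/6, 1/6] (CG = 3/6), then
#
#     score_ncg([2/6, 1/6], [3/6, 2/6])  # -> (3/6) / (5/6) = 0.6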


def compute_ncg(pred_gains: List[Tuple[int, float]], gt_gains: List[Tuple[int, float]], k: int) -> float:
    """
    Compute the Normalized Cumulative Gain (NCG) score based on predicted and ground truth gains up to rank k.

    Args:
        pred_gains (List[Tuple[int, float]]): List of tuples representing predicted gains by the model,
            where each tuple contains a document sentence position (index) and its corresponding gain value.
            (Sorted in descending order of gain.)
        gt_gains (List[Tuple[int, float]]): List of tuples representing ground truth (ideal) gains,
            where each tuple contains a document sentence position (index) and its corresponding gain value.
            (Sorted in descending order of gain.)
        k (int): The rank threshold used for evaluating gains (typically top-k sentences).

    Returns:
        float: Normalized Cumulative Gain (NCG) score based on the predicted gains compared to the ground truth gains.

    Notes:
        - Both `pred_gains` and `gt_gains` should be sorted in descending order, where higher gain values indicate
          higher relevance.
        - The function calculates NCG up to rank `k`, considering only the top-k sentences.
        - Uses the `score_ncg` function to compute the NCG score based on the model's predicted gains and the ground
          truth gains.
    """
    gt_dict = dict(gt_gains)
    gt_rel = [v for _, v in gt_gains[:k]]
    model_rel = [gt_dict[position] for position, _ in pred_gains[:k]]
    return score_ncg(model_rel, gt_rel)
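
# A minimal worked example of `compute_ncg` (illustrative, reusing the gains from the example above):
#
#     gt_gains = [(1, 3/6), (2, 2/6), (0, 1/6)]    # ideal ranking induced by the reference
#     pred_gains = [(2, 3/6), (0, 2/6), (1, 1/6)]  # ranking induced by the prediction
#     compute_ncg(pred_gains, gt_gains, k=2)       # -> 0.6
#
# The ideal top-2 gain is 3/6 + 2/6 = 5/6, while the predicted top-2 sentences (indices 2 and 0)
# carry ground-truth gains 2/6 + 1/6 = 3/6, so the NCG is (3/6) / (5/6) = 0.6.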


def _validate_input_format(
        tokenize_sentences: bool,
        predictions: DOCUMENT_TYPE,
        references: DOCUMENT_TYPE,
        documents: DOCUMENT_TYPE
):
    """
    Validate the format of predictions, references, and documents based on specified criteria.

    Args:
        tokenize_sentences (bool): Flag indicating whether sentences should be tokenized.
        predictions (DOCUMENT_TYPE): Predictions to validate.
        references (DOCUMENT_TYPE): References to validate.
        documents (DOCUMENT_TYPE): Documents to validate.

    Raises:
        ValueError: If the format of predictions, references, or documents does not meet the specified criteria.

    Validation Criteria:
    The function validates predictions, references, and documents based on the following conditions:
    1. If `tokenize_sentences` is True:
       - Predictions, references, and documents must all be lists of strings
         (`is_nested_list_of_type(obj, element_type=str, depth=1)`).

    2. If `tokenize_sentences` is False:
       - Predictions, references, and documents must all be lists of lists of strings
         (`is_nested_list_of_type(obj, element_type=str, depth=2)`).

    The function checks these conditions and raises a ValueError if any condition is not met,
    indicating that predictions, references, or documents are not in the valid input format.

    Notes:
    - `DOCUMENT_TYPE`: Union[List[str], List[List[str]]]
    - Uses helper function `is_nested_list_of_type` to validate the format of (nested) lists of strings.

    Example:
        >>> tokenize_sentences = True
        >>> predictions = ["This is prediction 1.", "This is prediction 2."]
        >>> references = ["Reference for prediction 1.", "Reference for prediction 2."]
        >>> documents = ["Document 1 content.", "Document 2 content."]
        >>> _validate_input_format(tokenize_sentences, predictions, references, documents)

    Example:
        >>> tokenize_sentences = False
        >>> predictions = [["Sentence 1 in prediction 1.", "Sentence 2 in prediction 1."],
        ...                ["Sentence 1 in prediction 2.", "Sentence 2 in prediction 2."]]
        >>> references = [["Sentences in reference 1."], ["Sentences in reference 2."]]
        >>> documents = [["Sentence 1 in document 1.", "Sentence 2 in document 1."],
        ...              ["Sentence 1 in document 2.", "Sentence 2 in document 2."]]
        >>> _validate_input_format(tokenize_sentences, predictions, references, documents)
    """
    if not (len(predictions) == len(references) == len(documents)):
        raise ValueError(
            f"Predictions, References and Documents must have the same length. "
            f"Got {len(predictions)} predictions, {len(references)} references and {len(documents)} documents."
        )

    if len(predictions) == 0:
        raise ValueError("Can't have empty inputs")

    def check_format(lst_obj, expected_depth: int, name: str):
        is_valid, error_message = is_nested_list_of_type(lst_obj, element_type=str, depth=expected_depth)
        if not is_valid:
            raise ValueError(f"{name} are not in the expected format.\n"
                             f"Error: {error_message}.")

    try:
        if tokenize_sentences:
            check_format(predictions, expected_depth=1, name="predictions")
            check_format(references, expected_depth=1, name="references")
            check_format(documents, expected_depth=1, name="documents")
        else:
            check_format(predictions, expected_depth=2, name="predictions")
            check_format(references, expected_depth=2, name="references")
            check_format(documents, expected_depth=2, name="documents")
    except ValueError as ve:
        raise ValueError(f"Input validation error: {ve}")


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemNCG(evaluate.Metric):
    """
    SemnCG (Semantic Normalized Cumulative Gain) Metric.

    This metric evaluates the quality of predicted sentences in relation to reference sentences and documents
    using Semantic Normalized Cumulative Gain (NCG). It computes the gain values and NCG scores based on
    cosine similarity between sentence embeddings, leveraging a Sentence-BERT encoder.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", **kwargs):
        self.sbert_encoder = get_sbert_encoder(model_name)
        super().__init__(**kwargs)

    def _info(self):
        # Specifies the evaluate.MetricInfo object for this metric
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=[
                # Tokenize_Sentences = True
                datasets.Features(
                    {
                        "predictions": datasets.Value("string"),
                        "references": datasets.Value("string"),
                        "documents": datasets.Value("string"),
                    }
                ),
                # Tokenize_Sentences = False
                datasets.Features(
                    {
                        "predictions": datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                        "documents": datasets.Sequence(datasets.Value("string", id="sequence"), id="documents"),
                    }
                ),
            ],
            # # Homepage of the module for documentation
            # homepage="http://module.homepage",
            # # Additional links to the codebase or references
            # codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["https://aclanthology.org/2022.findings-acl.122/"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        nltk.download("punkt", quiet=True)

    def _compute(
            self,
            predictions: DOCUMENT_TYPE,
            references: DOCUMENT_TYPE,
            documents: DOCUMENT_TYPE,
            k: int = 3,
            gpu: DEVICE_TYPE = False,
            verbose: bool = False,
            batch_size: int = 32,
            tokenize_sentences: bool = True,
            pre_compute_embeddings: bool = False,
            debug: bool = False,
    ) -> Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
        """
        Compute the Semantic Normalized Cumulative Gain (SemnCG) score.

        Args:
            predictions (DOCUMENT_TYPE): The predicted sentences. 
                                         `tokenize_sentences`=True -> predictions: List[str]
                                         `tokenize_sentences`=False -> predictions: List[List[str]]
            references (DOCUMENT_TYPE): The reference sentences.
                                        `tokenize_sentences`=True -> references: List[str]
                                        `tokenize_sentences`=False -> references: List[List[str]]
            documents (DOCUMENT_TYPE): Input documents.
                                       `tokenize_sentences`=True -> documents: List[str]
                                       `tokenize_sentences`=False -> documents: List[List[str]]
            k (int, optional): The rank threshold used for evaluating gains (typically top-k sentences). Default is 3.
            gpu (DEVICE_TYPE, optional): Whether to use GPU for computation. Default is False.
            verbose (bool, optional): Whether to print verbose logs. Default is False.
            batch_size (int, optional): The batch size for encoding sentences. Default is 32.
            tokenize_sentences (bool, optional): Whether to tokenize sentences. If True, sentences are tokenized before
                                                 processing. Default is True.
            pre_compute_embeddings (bool, optional): Whether to pre-compute embeddings for all sentences. This speeds up
                                                     computation but requires more memory. Default is False.
            debug (bool, optional): Whether to return detailed debug information including ranked gains. Default=False.

        Returns:
            Union[Tuple[float, List[float]], Tuple[float, List[RankedGains]]]:
            If `debug` is False, returns a tuple containing the mean SemnCG score and a list of SemnCG scores for each document.
            If `debug` is True, returns a tuple containing the mean SemnCG score and a list of `RankedGains` objects with detailed gain information for each document.

        Raises:
            ValueError: If the format of predictions, references, or documents does not meet the specified criteria.

        Notes:
            - Validates the format of predictions, references, and documents based on `tokenize_sentences`.
            - Computes embeddings using a Sentence-BERT encoder.
            - Computes cosine similarity between document, reference, and prediction embeddings.
            - Calculates gain values and Normalized Cumulative Gain (NCG) scores.
            - Optionally returns detailed debug information for each document if `debug` is True.
        """

        # Validate inputs corresponding to flags
        _validate_input_format(tokenize_sentences, predictions, references, documents)

        # Get GPU
        device = get_gpu(gpu)
        if verbose:
            print(f"Using devices: {device}")

        # Get model
        encoder = get_encoder(self.sbert_encoder, device=device, batch_size=batch_size, verbose=verbose)

        if pre_compute_embeddings:  # fast but takes more memory
            predictions = [tokenize_and_prep_document(pred, tokenize_sentences) for pred in predictions]
            references = [tokenize_and_prep_document(ref, tokenize_sentences) for ref in references]
            documents = [tokenize_and_prep_document(doc, tokenize_sentences) for doc in documents]

            # This is only done for debug case
            sent_tokenized_documents = documents

            # Compute All Embeddings
            all_sentences = flatten_list(documents) + flatten_list(references) + flatten_list(predictions)
            embeddings = encoder.encode(all_sentences)

            prediction_sentences_count = [len(pred) for pred in predictions]
            reference_sentences_count = [len(ref) for ref in references]
            document_sentences_count = [len(doc) for doc in documents]

            # Get embeddings corresponding to documents, references and predictions (IN ORDER)
            doc_embeddings = slice_embeddings(embeddings, document_sentences_count)
            ref_embeddings = slice_embeddings(embeddings[sum(document_sentences_count):], reference_sentences_count)
            pred_embeddings = slice_embeddings(
                embeddings[sum(document_sentences_count + reference_sentences_count):], prediction_sentences_count
            )

            iterable_obj = zip(pred_embeddings, ref_embeddings, doc_embeddings)

        else:
            iterable_obj = zip(predictions, references, documents)

        out = []
        for idx, (pred, ref, doc) in enumerate(tqdm(iterable_obj)):

            if not pre_compute_embeddings:  # Compute embeddings
                ref_sentences = tokenize_and_prep_document(ref, tokenize_sentences)
                pred_sentences = tokenize_and_prep_document(pred, tokenize_sentences)
                doc_sentences = tokenize_and_prep_document(doc, tokenize_sentences)

                # Compute Embeddings
                doc_sentence_count = len(doc_sentences)
                ref_sentence_count = len(ref_sentences)
                all_sentences = doc_sentences + ref_sentences + pred_sentences
                embeddings = encoder.encode(all_sentences)
                doc_embeddings = embeddings[:doc_sentence_count]
                ref_embeddings = embeddings[doc_sentence_count:doc_sentence_count + ref_sentence_count]
                pred_embeddings = embeddings[doc_sentence_count + ref_sentence_count:]
            else:  # we already have embeddings
                doc_embeddings = doc
                ref_embeddings = ref
                pred_embeddings = pred

                doc_sentences = sent_tokenized_documents[idx]

            # Compute Pair-Wise Cosine Similarity
            ref_sim_scores = compute_cosine_similarity(doc_embeddings, ref_embeddings)
            pred_sim_scores = compute_cosine_similarity(doc_embeddings, pred_embeddings)

            # Compute Gains
            ground_truth_gain = compute_gain(ref_sim_scores)

            # this is used to compute top-predicted sentence indices
            pred_gain = compute_gain(pred_sim_scores)
            real_k = min(len(pred_gain), k)

            # Compute NCG Scores
            ncg_score = compute_ncg(pred_gain, ground_truth_gain, real_k)

            if debug:
                ground_truth_gain = [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in ground_truth_gain]
                pred_gain = [(doc_sentences[sent_idx], gain_val) for sent_idx, gain_val in pred_gain]
                out.append(RankedGains(ground_truth_gain, pred_gain, k=real_k, ncg=ncg_score))
            else:
                out.append(ncg_score)

        if debug:
            return statistics.mean([ele.ncg for ele in out]), out

        return statistics.mean(out), out
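
# A minimal end-to-end usage sketch (comments only; the hub id and model name below simply mirror
# the example in the module docstring):
#
#     import evaluate
#     metric = evaluate.load("nbansal/semncg", model_name="all-MiniLM-L6-v2")
#     mean_score, details = metric.compute(
#         predictions=["Prediction sentence 1. Prediction sentence 2."],
#         references=["Reference sentence 1. Reference sentence 2."],
#         documents=["Document sentence 1. Document sentence 2."],
#         k=3,
#         debug=True,
#     )
#     # With debug=True, `details` is a list of RankedGains objects pairing each document sentence
#     # with its ground-truth and predicted gain; with debug=False it is a list of per-example
#     # SemnCG scores.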