Spaces:

BridgeAI-Lab
/

SemF1

Running

App Files Files Community

nbansal committed on Jun 20

Commit

de5dcb7

•

1 Parent(s): f583bc0

Refactored the code and made it faster

Browse files

Files changed (2) hide show

semf1.py +111 -116
utils.py +87 -0

semf1.py CHANGED Viewed

@@ -26,6 +26,9 @@ from numpy.typing import NDArray
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import torch
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
@@ -120,6 +123,9 @@ Examples:
     [0.77, 0.56]
 """
 class Encoder(metaclass=abc.ABCMeta):
     @abc.abstractmethod
@@ -149,23 +155,12 @@ class SBertEncoder(Encoder):
 def _get_encoder(model_name: str, device: Union[str, int], batch_size: int) -> Encoder:
     if model_name == "use":
-        return SBertEncoder(model_name, device)
         # return USE()  # TODO: This will change depending on PyTorch USE VS TF USE model
     else:
         return SBertEncoder(model_name, device, batch_size)
-def _compute_f1(p, r, eps=sys.float_info.epsilon):
-    '''
-    Computes F1 value
-    :param p: Precision Value
-    :param r: Recall Value
-    :return:
-    '''
-    f1 = 2 * p * r / (p + r + eps)
-    return f1
 def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tuple[float, float]:
     cosine_scores = cosine_similarity(pred_embeds, ref_embeds)
     precision_per_sentence_sim = np.max(cosine_scores, axis=-1)
@@ -173,6 +168,48 @@ def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tup
     return np.mean(precision_per_sentence_sim).item(), np.mean(recall_per_sentence_sim).item()
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class SemF1(evaluate.Metric):
     _MODEL_TYPE_TO_NAME = {
@@ -251,7 +288,8 @@ class SemF1(evaluate.Metric):
         """Optional: download external resources useful to compute the scores"""
         import nltk
         nltk.download("punkt", quiet=True)
-        # if not nltk.data.find("tokenizers/punkt"):
     def _compute(
@@ -260,114 +298,71 @@ class SemF1(evaluate.Metric):
             references,
             model_type: Optional[str] = None,
             tokenize_sentences: bool = True,
             gpu: Union[bool, int] = False,
             batch_size: int = 32,
-    ):
-        # Ensure gpu index is within the range of total available gpus
-        gpu_available = True if torch.cuda.is_available() else False
-        if gpu_available:
-            gpu_count = torch.cuda.device_count()
-            if isinstance(gpu, int) and gpu >= gpu_count:
-                raise ValueError(
-                    f"There are {gpu_count} gpus available. Provide the correct gpu index. You provided: {gpu}"
-                )
-        # get the device
-        if gpu is False:
-            device = "cpu"
-        elif gpu is True and torch.cuda.is_available():
-            device = 0  # by default run on device 0
-        elif isinstance(gpu, int):
-            device = gpu
-        else:  # This will never happen
-            raise ValueError(f"gpu must be bool or int. Provided value: {gpu}")
-        # TODO: Also have a check on references to ensure they are also in correct format
-        # Ensure prediction documents are not already tokenized if tokenize_sentences is True
-        if not isinstance(predictions[0], str) and tokenize_sentences:
-            raise ValueError(f"Each prediction/reference should be a document i.e. when tokenize_sentences is True. "
-                             f"Currently, each prediction is of type {type(predictions[0])} ")
-        # Check single reference or multi-reference case
-        multi_references = False
-        if tokenize_sentences:
-            # references: List[List[reference]]
-            if isinstance(references[0], list) and isinstance(references[0][0], str):
-                multi_references = True
-        else:
-            # references: List[List[List[sentence]]]
-            if (
-                    isinstance(references[0], list) and
-                    isinstance(references[0][0], list) and
-                    isinstance(references[0][0][0], str)
-            ):
-                multi_references = True
         # Get the encoder model
         model_name = self._get_model_name(model_type)
-        encoder = _get_encoder(model_name, device=device)
         # Init output scores
-        precisions = [0] * len(predictions)
-        recalls = [0] * len(predictions)
-        f1_scores = [0] * len(predictions)
-        # Compute Score in case of single reference
-        if not multi_references:
-            for idx, (pred, ref) in enumerate(zip(predictions, references)):
-                # Sentence Tokenize prediction and reference
-                if tokenize_sentences:
-                    ref = nltk.tokenize.sent_tokenize(ref)  # List[str]
-                    pred = nltk.tokenize.sent_tokenize(pred)  # List[str]
-                pred_sent_count = len(pred)
-                embeddings = encoder.encode(pred + ref)
-                pred_embeddings = embeddings[:pred_sent_count]
-                ref_embeddings = embeddings[pred_sent_count:]
-                p, r = _compute_cosine_similarity(pred_embeddings, ref_embeddings)
-                f1 = _compute_f1(p, r)
-                precisions[idx] = p
-                recalls[idx] = r
-                f1_scores[idx] = f1
-        else:
-            # Compute Score in case of multiple reference
-            for idx, (pred, refs) in enumerate(zip(predictions, references)):
-                # Sentence Tokenize prediction and reference
-                if tokenize_sentences:
-                    refs = [nltk.tokenize.sent_tokenize(ref) for ref in refs]  # List[List[str]]
-                    pred = nltk.tokenize.sent_tokenize(pred)  # List[str]
-                ref_count = len(refs)
-                pred_sent_count = len(pred)
-                ref_sent_counts = [0] + [len(ref) for ref in refs]
-                cumsum_ref_sent_counts = np.cumsum(ref_sent_counts)
-                all_sentences = pred + sum(refs, [])
-                embeddings = encoder.encode(all_sentences)
-                pred_embeddings = embeddings[:pred_sent_count]
-                ref_embeddings = [
-                    embeddings[pred_sent_count + cumsum_ref_sent_counts[c_idx]:
-                               pred_sent_count + cumsum_ref_sent_counts[c_idx + 1]]
-                    for c_idx in range(ref_count)
-                ]
-                # pred_embeddings = encoder.encode(pred)
-                # ref_embeddings = [encoder.encode(refs) for ref in refs]
-                # Precision: Concatenate all the sentences in all the references
-                concat_ref_embeddings = np.concatenate(ref_embeddings, axis=0)
-                p, _ = _compute_cosine_similarity(pred_embeddings, concat_ref_embeddings)
-                # Recall: Compute individually for each reference
-                scores = [_compute_cosine_similarity(r_embeds, pred_embeddings) for r_embeds in ref_embeddings]
-                r = np.mean([r_scores for (r_scores, _) in scores]).item()
-                f1 = _compute_f1(p, r)
-                precisions[idx] = p  # TODO: check why idx says invalid type
-                recalls[idx] = r
-                f1_scores[idx] = f1
-        return {"precision": precisions, "recall": recalls, "f1": f1_scores}

 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import torch
+from tqdm import tqdm
+from utils import is_list_of_strings_at_depth, Scores, slice_embeddings, flatten_list
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
     [0.77, 0.56]
 """
+_PREDICTION_TYPE = Union[List[str], List[List[str]]]
+_REFERENCE_TYPE = Union[List[str], List[List[str]], List[List[List[str]]]]
 class Encoder(metaclass=abc.ABCMeta):
     @abc.abstractmethod
 def _get_encoder(model_name: str, device: Union[str, int], batch_size: int) -> Encoder:
+    """
+    Build the sentence encoder for the given model name.
+    :param model_name: Name of the model; "use" is special-cased below.
+    :param device: Device identifier forwarded to the encoder.
+    :param batch_size: Batch size forwarded to the encoder.
+    :return: An SBertEncoder instance (both branches currently return one).
+    """
     if model_name == "use":
+        return SBertEncoder(model_name, device, batch_size)
         # return USE()  # TODO: This will change depending on PyTorch USE VS TF USE model
     else:
         return SBertEncoder(model_name, device, batch_size)
 def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tuple[float, float]:
     cosine_scores = cosine_similarity(pred_embeds, ref_embeds)
     precision_per_sentence_sim = np.max(cosine_scores, axis=-1)
     return np.mean(precision_per_sentence_sim).item(), np.mean(recall_per_sentence_sim).item()
+def _get_gpu(gpu: Union[bool, int]) -> Union[str, int]:
+    """
+    Resolve the ``gpu`` argument into a device identifier for the encoder.
+    :param gpu: ``False`` selects the CPU, ``True`` selects CUDA device 0, and an
+        integer selects the CUDA device with that index.
+    :return: ``"cpu"`` or the integer index of the CUDA device to use.
+    :raises ValueError: If ``gpu`` is neither bool nor int, or if a GPU is requested
+        but the index is outside the range of available CUDA devices.
+    """
+    # bool is a subclass of int, so it has to be handled before any index check;
+    # otherwise ``True`` is compared as index 1 and rejected on single-GPU hosts.
+    if isinstance(gpu, bool):
+        if not gpu:
+            return "cpu"
+        device = 0  # by default run on device 0
+    elif isinstance(gpu, int):
+        device = gpu
+    else:
+        raise ValueError(f"gpu must be bool or int. Provided value: {gpu}")
+    # Ensure gpu index is within the range of total available gpus
+    gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    if not 0 <= device < gpu_count:
+        raise ValueError(
+            f"There are {gpu_count} gpus available. Provide the correct gpu index. You provided: {gpu}"
+        )
+    return device
+def _validate_input_format(
+        tokenize_sentences: bool,
+        multi_references: bool,
+        predictions: _PREDICTION_TYPE,
+        references: _REFERENCE_TYPE,
+):
+    """
+    Check that predictions and references have the nesting implied by the flags.
+    :param tokenize_sentences: If True each document must be a string; otherwise each
+        document must already be a list of sentence strings.
+    :param multi_references: If True each entry of ``references`` must be a list of documents.
+    :param predictions: Predicted documents.
+    :param references: Reference documents (or lists of them).
+    :raises ValueError: If the nesting does not match the flags, or if the number of
+        predictions differs from the number of reference entries.
+    """
+    # A document is a plain string (list depth 1) unless it is pre-split into sentences (depth 2).
+    prediction_depth = 1 if tokenize_sentences else 2
+    # Multiple references wrap each entry in one additional list level.
+    reference_depth = prediction_depth + 1 if multi_references else prediction_depth
+    valid = (
+        is_list_of_strings_at_depth(predictions, prediction_depth)
+        and is_list_of_strings_at_depth(references, reference_depth)
+    )
+    if not valid:
+        raise ValueError("Predictions and references are not in a valid input format. Refer to documentation.")
+    # Predictions and references are paired positionally, so a length mismatch
+    # would otherwise silently drop the unmatched tail.
+    if len(predictions) != len(references):
+        raise ValueError(
+            f"Number of predictions ({len(predictions)}) and references ({len(references)}) must be equal."
+        )
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class SemF1(evaluate.Metric):
     _MODEL_TYPE_TO_NAME = {
         """Optional: download external resources useful to compute the scores"""
         import nltk
         nltk.download("punkt", quiet=True)
+        # if not nltk.data.find("tokenizers/punkt"):  # TODO: check why it is not working
+        #     pass
     def _compute(
             references,
             model_type: Optional[str] = None,
             tokenize_sentences: bool = True,
+            multi_references: bool = False,
             gpu: Union[bool, int] = False,
             batch_size: int = 32,
+    ) -> List[Scores]:
+        """
+            Compute precision, recall, and F1 scores for given predictions and references.
+            :param predictions
+            :param references
+            :param model_type: Type of model to use for encoding.
+            :param tokenize_sentences: Flag to sentence tokenize the document.
+            :param multi_references: Flag to indicate multiple references.
+            :param gpu: GPU device to use.
+            :param batch_size: Batch size for encoding.
+            :return: List of Scores dataclass with precision, recall, and F1 scores.
+        """
+        # Validate inputs corresponding to flags
+        _validate_input_format(tokenize_sentences, multi_references, predictions, references)
+        # Get GPU
+        device = _get_gpu(gpu)
         # Get the encoder model
         model_name = self._get_model_name(model_type)
+        encoder = _get_encoder(model_name, device=device, batch_size=batch_size)
+        # We'll handle the single reference and multi-reference case same way. So change the data format accordingly
+        if not multi_references:
+            references = [[ref] for ref in references]
+        # Tokenize sentences if required
+        if tokenize_sentences:
+            predictions = [nltk.tokenize.sent_tokenize(pred) for pred in predictions]
+            references = [[nltk.tokenize.sent_tokenize(ref) for ref in refs] for refs in references]
+        # Flatten the data for batch processing
+        all_sentences = flatten_list(predictions) + flatten_list(references)
+        # Get num of sentences to get the corresponding embeddings
+        prediction_sentences_count = [len(pred) for pred in predictions]
+        reference_sentences_count = [[len(ref) for ref in refs] for refs in references]
+        # Note: This is the most optimal way of doing it
+        # Encode all sentences in one go
+        embeddings = encoder.encode(all_sentences)
+        # Get embeddings corresponding to predictions and references
+        pred_embeddings = slice_embeddings(embeddings, prediction_sentences_count)
+        ref_embeddings = slice_embeddings(embeddings[sum(prediction_sentences_count):], reference_sentences_count)
         # Init output scores
+        results = []
+        # Compute scores
+        for preds, refs in zip(pred_embeddings, ref_embeddings):
+            # Precision: Concatenate all the sentences in all the references
+            concat_refs = np.concatenate(refs, axis=0)
+            precision, _ = _compute_cosine_similarity(preds, concat_refs)
+            # Recall: Compute individually for each reference
+            recall_scores = [_compute_cosine_similarity(r_embeds, preds) for r_embeds in refs]
+            recall_scores = [r_scores for (r_scores, _) in recall_scores]
+            results.append(Scores(precision, recall_scores))
+        return results

utils.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from dataclasses import dataclass, field
+import statistics
+import sys
+from typing import List, Union
+from numpy.typing import NDArray
+NumSentencesType = Union[List[int], List[List[int]]]
+EmbeddingSlicesType = Union[List[NDArray], List[List[NDArray]]]
+def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> EmbeddingSlicesType:
+    """
+    Split ``embeddings`` along its first axis into consecutive chunks.
+    :param embeddings: Array whose leading rows are sliced, starting at row 0.
+    :param num_sentences: Either a flat list of chunk sizes, or a list of such lists.
+        In the nested case the chunks of all sub-lists are taken back to back.
+    :return: A list of slices for a flat input, or a list of lists of slices for a nested input.
+    :raises TypeError: If ``num_sentences`` is neither a list of ints nor a list of lists of ints.
+    :raises ValueError: If a count is negative or the counts need more rows than ``embeddings`` has.
+    """
+    total_rows = len(embeddings)
+    def _is_int_list(obj) -> bool:
+        return isinstance(obj, list) and all(isinstance(item, int) for item in obj)
+    def _slice_embeddings(s_idx: int, n_sentences: List[int]):
+        _result = []
+        for count in n_sentences:
+            e_idx = s_idx + count
+            # Out-of-range slicing would silently yield short or empty chunks,
+            # so fail loudly instead of returning misaligned embeddings.
+            if count < 0 or e_idx > total_rows:
+                raise ValueError(
+                    f"Cannot slice rows {s_idx}:{e_idx} from embeddings with {total_rows} rows"
+                )
+            _result.append(embeddings[s_idx:e_idx])
+            s_idx = e_idx
+        return _result, s_idx
+    if _is_int_list(num_sentences):
+        result, _ = _slice_embeddings(0, num_sentences)
+        return result
+    if isinstance(num_sentences, list) and all(_is_int_list(sublist) for sublist in num_sentences):
+        nested_result = []
+        start_idx = 0
+        for nested_num_sentences in num_sentences:
+            embedding_slice, start_idx = _slice_embeddings(start_idx, nested_num_sentences)
+            nested_result.append(embedding_slice)
+        return nested_result
+    raise TypeError(f"Incorrect Type for {num_sentences=}")
+def is_list_of_strings_at_depth(obj, depth: int) -> bool:
+    """
+    Tell whether ``obj`` is a list nested exactly ``depth`` levels deep whose leaves are all strings.
+    :param obj: Object to inspect.
+    :param depth: Expected nesting level; 0 means ``obj`` itself must be a string.
+    :return: True if every level is a list and every leaf is a string, False otherwise.
+    :raises ValueError: If ``depth`` is negative.
+    """
+    if depth < 0:
+        raise ValueError("Depth can't be negative")
+    if depth == 0:
+        return isinstance(obj, str)
+    if not isinstance(obj, list):
+        return False
+    for element in obj:
+        if not is_list_of_strings_at_depth(element, depth - 1):
+            return False
+    return True
+def flatten_list(nested_list: list) -> list:
+    """
+    Collapse a list nested to any depth into a single flat list.
+    Parameters:
+        nested_list (list): The nested list to flatten.
+    Returns:
+        list: The non-list elements of ``nested_list`` in their original order.
+    """
+    def _leaves(items):
+        # Depth-first walk: descend into lists, emit everything else unchanged.
+        for element in items:
+            if isinstance(element, list):
+                yield from _leaves(element)
+            else:
+                yield element
+    return list(_leaves(nested_list))
+def compute_f1(p: float, r: float, eps=sys.float_info.epsilon) -> float:
+    """
+    Computes F1 value
+    :param p: Precision Value
+    :param r: Recall Value
+    :param eps: Epsilon Value added to the denominator so that p = r = 0 gives 0 instead of dividing by zero
+    :return: F1 value, i.e. 2 * p * r / (p + r + eps)
+    """
+    return 2 * p * r / (p + r + eps)
+@dataclass
+class Scores:
+    """
+    Precision, per-reference recall values, and the F1 derived from them.
+    ``f1`` is computed from ``precision`` and the mean of ``recall`` on construction.
+    """
+    precision: float
+    recall: List[float]
+    # Declared as a non-init field so that it shows up in repr(), ==, and dataclasses.asdict().
+    f1: float = field(init=False)
+    def __post_init__(self):
+        self.f1 = compute_f1(self.precision, statistics.fmean(self.recall))