Spaces:

BridgeAI-Lab
/

SemF1

Running

App Files Files Community

nbansal committed on Jun 26

Commit

251bfda

•

1 Parent(s): 23122f2

Support other SentenceTransformer models as well and update the documentation accordingly

Browse files

Files changed (4) hide show

README.md +11 -3
encoder_models.py +19 -20
semf1.py +12 -8
tests.py +35 -24

README.md CHANGED Viewed

@@ -53,6 +53,10 @@ Sem-F1 also accepts multiple optional arguments:
      - `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
      - `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
      - `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
    - `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
    - `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
    - `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
@@ -79,10 +83,14 @@ List of `Scores` dataclass corresponding to each sample -
  - `f1: float`: F1 score (between precision and average recall).
-## Future Extensions
 Currently, we have only implemented the 3 encoders* that we experimented with in our
-[paper](https://aclanthology.org/2022.emnlp-main.49/). However, it can easily with extended for more models by simply
-extending the `Encoder` base class. (Refer to `encoder_models.py` file).
 `*` *In our paper, we used the TensorFlow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
 of the USE model; however, in our current implementation, we use the [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*

      - `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
      - `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
      - `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
+          Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer
+          such as `all-mpnet-base-v2` or `roberta-base`
    - `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
    - `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
    - `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
  - `f1: float`: F1 score (between precision and average recall).
+## Extensions
 Currently, we have only implemented the 3 encoders* that we experimented with in our
+[paper](https://aclanthology.org/2022.emnlp-main.49/). Furthermore, you can use any model on
+Huggingface/SentenceTransformer that is supported by SentenceTransformer such as `all-mpnet-base-v2` or `roberta-base`.
+If you want to use your own encoder model, make sure that it is supported by `SentenceTransformer`. Or, if it's a
+completely new architecture, the metric can easily be extended to support it by extending the `Encoder` base class
+(refer to the `encoder_models.py` file).
 `*` *In our paper, we used the TensorFlow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
 of the USE model; however, in our current implementation, we use the [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*

encoder_models.py CHANGED Viewed

@@ -25,14 +25,6 @@ class Encoder(abc.ABC):
         raise NotImplementedError("Method 'encode' must be implemented in subclass.")
-class USE(Encoder):
-    def __init__(self):
-        pass
-    def encode(self, prediction: List[str]) -> NDArray:
-        pass
 class SBertEncoder(Encoder):
     def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
         """
@@ -44,7 +36,7 @@ class SBertEncoder(Encoder):
                 batch_size (int): Batch size for encoding.
                 verbose (bool): Whether to print verbose information during encoding.
         """
-        self.model = SentenceTransformer(model_name)
         self.device = device
         self.batch_size = batch_size
         self.verbose = verbose
@@ -84,10 +76,13 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
         Args:
             model_name (str): Name of the model to instantiate
-                Options: [pv1, stsb, use]
-                    pv1 - paraphrase-distilroberta-base-v1 (Default)
-                    stsb - stsb-roberta-large
-                    use - Universal Sentence Encoder
             device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
                 (e.g., "cuda", 0 for GPU, "cpu").
             batch_size (int): Batch size for encoding.
@@ -97,12 +92,16 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
             Encoder: Instance of the selected encoder based on the model_name.
         Raises:
-            ValueError: If an unsupported model_name is provided.
         """
-    # TODO: chnage this when changing the TF model
-    if model_name == "use":
-        return SBertEncoder("sentence-transformers/use-cmlm-multilingual", device, batch_size, verbose)
-        # return USE()
-    else:
-        return SBertEncoder(model_name, device, batch_size, verbose)

         raise NotImplementedError("Method 'encode' must be implemented in subclass.")
 class SBertEncoder(Encoder):
     def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
         """
                 batch_size (int): Batch size for encoding.
                 verbose (bool): Whether to print verbose information during encoding.
         """
+        self.model = SentenceTransformer(model_name, trust_remote_code=True)
         self.device = device
         self.batch_size = batch_size
         self.verbose = verbose
         Args:
             model_name (str): Name of the model to instantiate
+                Options:
+                    paraphrase-distilroberta-base-v1,
+                    stsb-roberta-large,
+                    sentence-transformers/use-cmlm-multilingual
+                Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
+                SentenceTransformer.
             device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
                 (e.g., "cuda", 0 for GPU, "cpu").
             batch_size (int): Batch size for encoding.
             Encoder: Instance of the selected encoder based on the model_name.
         Raises:
+            EnvironmentError/RuntimeError: If an unsupported model_name is provided.
         """
+    try:
+        encoder = SBertEncoder(model_name, device, batch_size, verbose)
+    except EnvironmentError as err:
+        raise EnvironmentError(str(err)) from None
+    except Exception as err:
+        raise RuntimeError(str(err)) from None
+    return encoder

semf1.py CHANGED Viewed

@@ -62,9 +62,12 @@ Args:
     predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
     references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
     model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
-        pv1 - paraphrase-distilroberta-base-v1 (Default)
         stsb - stsb-roberta-large
-        use - Universal Sentence Encoder
     tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
     multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
     gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
@@ -241,7 +244,7 @@ class SemF1(evaluate.Metric):
     _MODEL_TYPE_TO_NAME = {
         "pv1": "paraphrase-distilroberta-base-v1",
         "stsb": "stsb-roberta-large",
-        "use": "use",  # "sentence-transformers/use-cmlm-multilingual",  # TODO: check PyTorch USE VS TF USE
     }
     def _info(self):
@@ -304,9 +307,7 @@ class SemF1(evaluate.Metric):
             model_type = "use"
         if model_type not in self._MODEL_TYPE_TO_NAME.keys():
-            raise ValueError(f"Provide a correct model_type.\n"
-                             f"Options: {self._MODEL_TYPE_TO_NAME.keys()}\n"
-                             f"Currently provided: {model_type}")
         return self._MODEL_TYPE_TO_NAME[model_type]
@@ -335,9 +336,12 @@ class SemF1(evaluate.Metric):
             :param references
             :param model_type: Type of model to use for encoding.
                 Options: [pv1, stsb, use]
-                    pv1 - paraphrase-distilroberta-base-v1 (Default)
                     stsb - stsb-roberta-large
-                    use - Universal Sentence Encoder
             :param tokenize_sentences: Flag to sentence tokenize the document.
             :param multi_references: Flag to indicate multiple references.
             :param gpu: GPU device to use.

     predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
     references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
     model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
+        pv1 - paraphrase-distilroberta-base-v1
         stsb - stsb-roberta-large
+        use - Universal Sentence Encoder (Default)
+    Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer such
+    as `all-mpnet-base-v2` or `roberta-base`
     tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
     multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
     gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
     _MODEL_TYPE_TO_NAME = {
         "pv1": "paraphrase-distilroberta-base-v1",
         "stsb": "stsb-roberta-large",
+        "use": "sentence-transformers/use-cmlm-multilingual",
     }
     def _info(self):
             model_type = "use"
         if model_type not in self._MODEL_TYPE_TO_NAME.keys():
+            return model_type
         return self._MODEL_TYPE_TO_NAME[model_type]
             :param references
             :param model_type: Type of model to use for encoding.
                 Options: [pv1, stsb, use]
+                    pv1 - paraphrase-distilroberta-base-v1
                     stsb - stsb-roberta-large
+                    use - Universal Sentence Encoder (Default)
+                Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
+                SentenceTransformer.
             :param tokenize_sentences: Flag to sentence tokenize the document.
             :param multi_references: Flag to indicate multiple references.
             :param gpu: GPU device to use.

tests.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import statistics
 import unittest
 import numpy as np
 import torch
@@ -153,32 +154,42 @@ class TestSBertEncoder(unittest.TestCase):
 class TestGetEncoder(unittest.TestCase):
-    def test_get_sbert_encoder(self):
-        model_name = "stsb-roberta-large"
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        batch_size = 8
-        verbose = False
-        encoder = get_encoder(model_name, device, batch_size, verbose)
         self.assertIsInstance(encoder, SBertEncoder)
-        self.assertEqual(encoder.device, device)
-        self.assertEqual(encoder.batch_size, batch_size)
-        self.assertEqual(encoder.verbose, verbose)
-    def test_get_use_encoder(self):
-        model_name = "use"
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        batch_size = 8
-        verbose = False
-        encoder = get_encoder(model_name, device, batch_size, verbose)
-        self.assertIsInstance(encoder, SBertEncoder)  # SBertEncoder is returned for "use" for now
-        # Uncomment below when implementing USE class
-        # self.assertIsInstance(encoder, USE)
-        # self.assertEqual(encoder.model_name, model_name)
-        # self.assertEqual(encoder.device, device)
-        # self.assertEqual(encoder.batch_size, batch_size)
-        # self.assertEqual(encoder.verbose, verbose)
 class TestSemF1(unittest.TestCase):

 import statistics
 import unittest
+from unittest.mock import patch, MagicMock
 import numpy as np
 import torch
 class TestGetEncoder(unittest.TestCase):
+    def setUp(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.batch_size = 8
+        self.verbose = False
+    def _base_test(self, model_name):
+        encoder = get_encoder(model_name, self.device, self.batch_size, self.verbose)
+        # Assert
         self.assertIsInstance(encoder, SBertEncoder)
+        self.assertEqual(encoder.device, self.device)
+        self.assertEqual(encoder.batch_size, self.batch_size)
+        self.assertEqual(encoder.verbose, self.verbose)
+    def test_get_sbert_encoder(self):
+        model_name = "stsb-roberta-large"
+        self._base_test(model_name)
+    def test_sbert_model(self):
+        model_name = "all-mpnet-base-v2"
+        self._base_test(model_name)
+    def test_huggingface_model(self):
+        """Test Huggingface models which work with SBert library"""
+        model_name = "roberta-base"
+        self._base_test(model_name)
+    def test_get_encoder_environment_error(self):  # This parameter is used when using patch decorator
+        model_name = "abc"  # Wrong model_name
+        with self.assertRaises(EnvironmentError):
+            get_encoder(model_name, self.device, self.batch_size, self.verbose)
+    def test_get_encoder_other_exception(self):
+        model_name = "apple/OpenELM-270M"  # This model is not supported by SentenceTransformer lib
+        with self.assertRaises(RuntimeError):
+            get_encoder(model_name, self.device, self.batch_size, self.verbose)
 class TestSemF1(unittest.TestCase):