Spaces:
Running
Running
Support other SentenceTransformer models as well and update the documentation accordingly
Browse files- README.md +11 -3
- encoder_models.py +19 -20
- semf1.py +12 -8
- tests.py +35 -24
README.md
CHANGED
@@ -53,6 +53,10 @@ Sem-F1 also accepts multiple optional arguments:
|
|
53 |
- `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
|
54 |
- `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
|
55 |
- `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
|
|
|
|
|
|
|
|
|
56 |
- `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
57 |
- `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
|
58 |
- `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
|
@@ -79,10 +83,14 @@ List of `Scores` dataclass corresponding to each sample -
|
|
79 |
- `f1: float`: F1 score (between precision and average recall).
|
80 |
|
81 |
|
82 |
-
##
|
83 |
Currently, we have only implemented the 3 encoders* that we experimented with in our
|
84 |
-
[paper](https://aclanthology.org/2022.emnlp-main.49/).
|
85 |
-
|
|
|
|
|
|
|
|
|
86 |
|
87 |
`*` *In our paper, we used the TensorFlow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
|
88 |
of the USE model, however, in our current implementation, we used [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*
|
|
|
53 |
- `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
|
54 |
- `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
|
55 |
- `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
|
56 |
+
|
57 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer
|
58 |
+
such as `all-mpnet-base-v2` or `roberta-base`
|
59 |
+
|
60 |
- `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
61 |
- `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
|
62 |
- `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
|
|
|
83 |
- `f1: float`: F1 score (between precision and average recall).
|
84 |
|
85 |
|
86 |
+
## Extensions
|
87 |
Currently, we have only implemented the 3 encoders* that we experimented with in our
|
88 |
+
[paper](https://aclanthology.org/2022.emnlp-main.49/). Furthermore, you can use any model on
|
89 |
+
Huggingface/SentenceTransformer that is supported by SentenceTransformer such as `all-mpnet-base-v2` or `roberta-base`.
|
90 |
+
|
91 |
+
If you want to use your own encoder model, make sure that it is supported by `SentenceTransformer`. Or, if it's a
|
92 |
+
completely new architecture, the metric can easily be extended to more models by subclassing the `Encoder` base class (refer to
|
93 |
+
`encoder_models.py` file).
|
94 |
|
95 |
`*` *In our paper, we used the TensorFlow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
|
96 |
of the USE model, however, in our current implementation, we used [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*
|
encoder_models.py
CHANGED
@@ -25,14 +25,6 @@ class Encoder(abc.ABC):
|
|
25 |
raise NotImplementedError("Method 'encode' must be implemented in subclass.")
|
26 |
|
27 |
|
28 |
-
class USE(Encoder):
|
29 |
-
def __init__(self):
|
30 |
-
pass
|
31 |
-
|
32 |
-
def encode(self, prediction: List[str]) -> NDArray:
|
33 |
-
pass
|
34 |
-
|
35 |
-
|
36 |
class SBertEncoder(Encoder):
|
37 |
def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
|
38 |
"""
|
@@ -44,7 +36,7 @@ class SBertEncoder(Encoder):
|
|
44 |
batch_size (int): Batch size for encoding.
|
45 |
verbose (bool): Whether to print verbose information during encoding.
|
46 |
"""
|
47 |
-
self.model = SentenceTransformer(model_name)
|
48 |
self.device = device
|
49 |
self.batch_size = batch_size
|
50 |
self.verbose = verbose
|
@@ -84,10 +76,13 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
|
|
84 |
|
85 |
Args:
|
86 |
model_name (str): Name of the model to instantiate
|
87 |
-
Options:
|
88 |
-
|
89 |
-
stsb
|
90 |
-
use
|
|
|
|
|
|
|
91 |
device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
|
92 |
(e.g., "cuda", 0 for GPU, "cpu").
|
93 |
batch_size (int): Batch size for encoding.
|
@@ -97,12 +92,16 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
|
|
97 |
Encoder: Instance of the selected encoder based on the model_name.
|
98 |
|
99 |
Raises:
|
100 |
-
|
101 |
"""
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
25 |
raise NotImplementedError("Method 'encode' must be implemented in subclass.")
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
class SBertEncoder(Encoder):
|
29 |
def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
|
30 |
"""
|
|
|
36 |
batch_size (int): Batch size for encoding.
|
37 |
verbose (bool): Whether to print verbose information during encoding.
|
38 |
"""
|
39 |
+
self.model = SentenceTransformer(model_name, trust_remote_code=True)
|
40 |
self.device = device
|
41 |
self.batch_size = batch_size
|
42 |
self.verbose = verbose
|
|
|
76 |
|
77 |
Args:
|
78 |
model_name (str): Name of the model to instantiate
|
79 |
+
Options:
|
80 |
+
paraphrase-distilroberta-base-v1,
|
81 |
+
stsb-roberta-large,
|
82 |
+
sentence-transformers/use-cmlm-multilingual
|
83 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
|
84 |
+
SentenceTransformer.
|
85 |
+
|
86 |
device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
|
87 |
(e.g., "cuda", 0 for GPU, "cpu").
|
88 |
batch_size (int): Batch size for encoding.
|
|
|
92 |
Encoder: Instance of the selected encoder based on the model_name.
|
93 |
|
94 |
Raises:
|
95 |
+
EnvironmentError/RuntimeError: If an unsupported model_name is provided.
|
96 |
"""
|
97 |
|
98 |
+
try:
|
99 |
+
encoder = SBertEncoder(model_name, device, batch_size, verbose)
|
100 |
+
except EnvironmentError as err:
|
101 |
+
raise EnvironmentError(str(err)) from None
|
102 |
+
except Exception as err:
|
103 |
+
raise RuntimeError(str(err)) from None
|
104 |
+
|
105 |
+
return encoder
|
106 |
+
|
107 |
+
|
semf1.py
CHANGED
@@ -62,9 +62,12 @@ Args:
|
|
62 |
predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
63 |
references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
64 |
model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
|
65 |
-
pv1 - paraphrase-distilroberta-base-v1
|
66 |
stsb - stsb-roberta-large
|
67 |
-
use - Universal Sentence Encoder
|
|
|
|
|
|
|
68 |
tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
69 |
multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
|
70 |
gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
|
@@ -241,7 +244,7 @@ class SemF1(evaluate.Metric):
|
|
241 |
_MODEL_TYPE_TO_NAME = {
|
242 |
"pv1": "paraphrase-distilroberta-base-v1",
|
243 |
"stsb": "stsb-roberta-large",
|
244 |
-
"use": "
|
245 |
}
|
246 |
|
247 |
def _info(self):
|
@@ -304,9 +307,7 @@ class SemF1(evaluate.Metric):
|
|
304 |
model_type = "use"
|
305 |
|
306 |
if model_type not in self._MODEL_TYPE_TO_NAME.keys():
|
307 |
-
|
308 |
-
f"Options: {self._MODEL_TYPE_TO_NAME.keys()}\n"
|
309 |
-
f"Currently provided: {model_type}")
|
310 |
|
311 |
return self._MODEL_TYPE_TO_NAME[model_type]
|
312 |
|
@@ -335,9 +336,12 @@ class SemF1(evaluate.Metric):
|
|
335 |
:param references
|
336 |
:param model_type: Type of model to use for encoding.
|
337 |
Options: [pv1, stsb, use]
|
338 |
-
pv1 - paraphrase-distilroberta-base-v1
|
339 |
stsb - stsb-roberta-large
|
340 |
-
use - Universal Sentence Encoder
|
|
|
|
|
|
|
341 |
:param tokenize_sentences: Flag to sentence tokenize the document.
|
342 |
:param multi_references: Flag to indicate multiple references.
|
343 |
:param gpu: GPU device to use.
|
|
|
62 |
predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
63 |
references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
64 |
model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
|
65 |
+
pv1 - paraphrase-distilroberta-base-v1
|
66 |
stsb - stsb-roberta-large
|
67 |
+
use - Universal Sentence Encoder (Default)
|
68 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer such
|
69 |
+
as `all-mpnet-base-v2` or `roberta-base`
|
70 |
+
|
71 |
tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
72 |
multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
|
73 |
gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
|
|
|
244 |
_MODEL_TYPE_TO_NAME = {
|
245 |
"pv1": "paraphrase-distilroberta-base-v1",
|
246 |
"stsb": "stsb-roberta-large",
|
247 |
+
"use": "sentence-transformers/use-cmlm-multilingual",
|
248 |
}
|
249 |
|
250 |
def _info(self):
|
|
|
307 |
model_type = "use"
|
308 |
|
309 |
if model_type not in self._MODEL_TYPE_TO_NAME.keys():
|
310 |
+
return model_type
|
|
|
|
|
311 |
|
312 |
return self._MODEL_TYPE_TO_NAME[model_type]
|
313 |
|
|
|
336 |
:param references
|
337 |
:param model_type: Type of model to use for encoding.
|
338 |
Options: [pv1, stsb, use]
|
339 |
+
pv1 - paraphrase-distilroberta-base-v1
|
340 |
stsb - stsb-roberta-large
|
341 |
+
use - Universal Sentence Encoder (Default)
|
342 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
|
343 |
+
SentenceTransformer.
|
344 |
+
|
345 |
:param tokenize_sentences: Flag to sentence tokenize the document.
|
346 |
:param multi_references: Flag to indicate multiple references.
|
347 |
:param gpu: GPU device to use.
|
tests.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import statistics
|
2 |
import unittest
|
|
|
3 |
|
4 |
import numpy as np
|
5 |
import torch
|
@@ -153,32 +154,42 @@ class TestSBertEncoder(unittest.TestCase):
|
|
153 |
|
154 |
|
155 |
class TestGetEncoder(unittest.TestCase):
|
156 |
-
def
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
verbose = False
|
161 |
|
162 |
-
|
|
|
|
|
|
|
163 |
self.assertIsInstance(encoder, SBertEncoder)
|
164 |
-
self.assertEqual(encoder.device, device)
|
165 |
-
self.assertEqual(encoder.batch_size, batch_size)
|
166 |
-
self.assertEqual(encoder.verbose, verbose)
|
167 |
-
|
168 |
-
def
|
169 |
-
model_name = "
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
|
184 |
class TestSemF1(unittest.TestCase):
|
|
|
1 |
import statistics
|
2 |
import unittest
|
3 |
+
from unittest.mock import patch, MagicMock
|
4 |
|
5 |
import numpy as np
|
6 |
import torch
|
|
|
154 |
|
155 |
|
156 |
class TestGetEncoder(unittest.TestCase):
|
157 |
+
def setUp(self):
|
158 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
159 |
+
self.batch_size = 8
|
160 |
+
self.verbose = False
|
|
|
161 |
|
162 |
+
def _base_test(self, model_name):
|
163 |
+
encoder = get_encoder(model_name, self.device, self.batch_size, self.verbose)
|
164 |
+
|
165 |
+
# Assert
|
166 |
self.assertIsInstance(encoder, SBertEncoder)
|
167 |
+
self.assertEqual(encoder.device, self.device)
|
168 |
+
self.assertEqual(encoder.batch_size, self.batch_size)
|
169 |
+
self.assertEqual(encoder.verbose, self.verbose)
|
170 |
+
|
171 |
+
def test_get_sbert_encoder(self):
|
172 |
+
model_name = "stsb-roberta-large"
|
173 |
+
self._base_test(model_name)
|
174 |
+
|
175 |
+
def test_sbert_model(self):
|
176 |
+
model_name = "all-mpnet-base-v2"
|
177 |
+
self._base_test(model_name)
|
178 |
+
|
179 |
+
def test_huggingface_model(self):
|
180 |
+
"""Test Huggingface models which work with SBert library"""
|
181 |
+
model_name = "roberta-base"
|
182 |
+
self._base_test(model_name)
|
183 |
+
|
184 |
+
def test_get_encoder_environment_error(self): # This parameter is used when using patch decorator
|
185 |
+
model_name = "abc" # Wrong model_name
|
186 |
+
with self.assertRaises(EnvironmentError):
|
187 |
+
get_encoder(model_name, self.device, self.batch_size, self.verbose)
|
188 |
+
|
189 |
+
def test_get_encoder_other_exception(self):
|
190 |
+
model_name = "apple/OpenELM-270M" # This model is not supported by SentenceTransformer lib
|
191 |
+
with self.assertRaises(RuntimeError):
|
192 |
+
get_encoder(model_name, self.device, self.batch_size, self.verbose)
|
193 |
|
194 |
|
195 |
class TestSemF1(unittest.TestCase):
|