nbansal committed
Commit de5dcb7
1 Parent(s): f583bc0

Refactored the code and made it faster

Files changed (2):
  1. semf1.py +111 -116
  2. utils.py +87 -0
semf1.py CHANGED
@@ -26,6 +26,9 @@ from numpy.typing import NDArray
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import torch
+from tqdm import tqdm
+
+from utils import is_list_of_strings_at_depth, Scores, slice_embeddings, flatten_list
 
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
@@ -120,6 +123,9 @@ Examples:
 [0.77, 0.56]
 """
 
+_PREDICTION_TYPE = Union[List[str], List[List[str]]]
+_REFERENCE_TYPE = Union[List[str], List[List[str]], List[List[List[str]]]]
+
 
 class Encoder(metaclass=abc.ABCMeta):
     @abc.abstractmethod
@@ -149,23 +155,12 @@ class SBertEncoder(Encoder):
 
 def _get_encoder(model_name: str, device: Union[str, int], batch_size: int) -> Encoder:
     if model_name == "use":
-        return SBertEncoder(model_name, device)
+        return SBertEncoder(model_name, device, batch_size)
         # return USE()  # TODO: This will change depending on PyTorch USE VS TF USE model
     else:
         return SBertEncoder(model_name, device, batch_size)
 
 
-def _compute_f1(p, r, eps=sys.float_info.epsilon):
-    '''
-    Computes F1 value
-    :param p: Precision Value
-    :param r: Recall Value
-    :return:
-    '''
-    f1 = 2 * p * r / (p + r + eps)
-    return f1
-
-
 def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tuple[float, float]:
     cosine_scores = cosine_similarity(pred_embeds, ref_embeds)
     precision_per_sentence_sim = np.max(cosine_scores, axis=-1)
@@ -173,6 +168,48 @@ def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tuple[float, float]:
     return np.mean(precision_per_sentence_sim).item(), np.mean(recall_per_sentence_sim).item()
 
 
+def _get_gpu(gpu: Union[bool, int]) -> Union[str, int]:
+    # Ensure the gpu index is within the range of total available gpus
+    gpu_available = torch.cuda.is_available()
+    if gpu_available:
+        gpu_count = torch.cuda.device_count()
+        if isinstance(gpu, int) and gpu >= gpu_count:
+            raise ValueError(
+                f"There are {gpu_count} gpus available. Provide the correct gpu index. You provided: {gpu}"
+            )
+
+    # Get the device
+    if gpu is False:
+        device = "cpu"
+    elif gpu is True and gpu_available:
+        device = 0  # by default run on device 0
+    elif isinstance(gpu, int):
+        device = gpu
+    else:  # This will never happen
+        raise ValueError(f"gpu must be bool or int. Provided value: {gpu}")
+
+    return device
+
+
+def _validate_input_format(
+        tokenize_sentences: bool,
+        multi_references: bool,
+        predictions: _PREDICTION_TYPE,
+        references: _REFERENCE_TYPE,
+):
+    if tokenize_sentences and multi_references:
+        condition = is_list_of_strings_at_depth(predictions, 1) and is_list_of_strings_at_depth(references, 2)
+    elif not tokenize_sentences and multi_references:
+        condition = is_list_of_strings_at_depth(predictions, 2) and is_list_of_strings_at_depth(references, 3)
+    elif tokenize_sentences and not multi_references:
+        condition = is_list_of_strings_at_depth(predictions, 1) and is_list_of_strings_at_depth(references, 1)
+    else:
+        condition = is_list_of_strings_at_depth(predictions, 2) and is_list_of_strings_at_depth(references, 2)
+
+    if not condition:
+        raise ValueError("Predictions and references are not in a valid input format. Refer to the documentation.")
+
+
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class SemF1(evaluate.Metric):
     _MODEL_TYPE_TO_NAME = {
@@ -251,7 +288,8 @@ class SemF1(evaluate.Metric):
         """Optional: download external resources useful to compute the scores"""
         import nltk
         nltk.download("punkt", quiet=True)
-        # if not nltk.data.find("tokenizers/punkt"):
+        # if not nltk.data.find("tokenizers/punkt"):  # TODO: check why it is not working
+        #     pass
 
 
     def _compute(
@@ -260,114 +298,71 @@
             references,
             model_type: Optional[str] = None,
             tokenize_sentences: bool = True,
+            multi_references: bool = False,
             gpu: Union[bool, int] = False,
             batch_size: int = 32,
-    ):
-
-        # Ensure gpu index is within the range of total available gpus
-        gpu_available = True if torch.cuda.is_available() else False
-        if gpu_available:
-            gpu_count = torch.cuda.device_count()
-            if isinstance(gpu, int) and gpu >= gpu_count:
-                raise ValueError(
-                    f"There are {gpu_count} gpus available. Provide the correct gpu index. You provided: {gpu}"
-                )
-
-        # get the device
-        if gpu is False:
-            device = "cpu"
-        elif gpu is True and torch.cuda.is_available():
-            device = 0  # by default run on device 0
-        elif isinstance(gpu, int):
-            device = gpu
-        else:  # This will never happen
-            raise ValueError(f"gpu must be bool or int. Provided value: {gpu}")
-
-        # TODO: Also have a check on references to ensure they are also in correct format
-        # Ensure prediction documents are not already tokenized if tokenize_sentences is True
-        if not isinstance(predictions[0], str) and tokenize_sentences:
-            raise ValueError(f"Each prediction/reference should be a document i.e. when tokenize_sentences is True. "
-                             f"Currently, each prediction is of type {type(predictions[0])} ")
-
-        # Check single reference or multi-reference case
-        multi_references = False
-        if tokenize_sentences:
-            # references: List[List[reference]]
-            if isinstance(references[0], list) and isinstance(references[0][0], str):
-                multi_references = True
-        else:
-            # references: List[List[List[sentence]]]
-            if (
-                    isinstance(references[0], list) and
-                    isinstance(references[0][0], list) and
-                    isinstance(references[0][0][0], str)
-            ):
-                multi_references = True
+    ) -> List[Scores]:
+        """
+        Compute precision, recall, and F1 scores for the given predictions and references.
+
+        :param predictions: Predictions (documents, or lists of sentences if already tokenized).
+        :param references: References (one or multiple per prediction, matching the flags below).
+        :param model_type: Type of model to use for encoding.
+        :param tokenize_sentences: Flag to sentence-tokenize the documents.
+        :param multi_references: Flag to indicate multiple references per prediction.
+        :param gpu: GPU device to use.
+        :param batch_size: Batch size for encoding.
+
+        :return: List of Scores dataclasses with precision, recall, and F1 scores.
+        """
+
+        # Validate inputs corresponding to flags
+        _validate_input_format(tokenize_sentences, multi_references, predictions, references)
+
+        # Get GPU
+        device = _get_gpu(gpu)
 
         # Get the encoder model
        model_name = self._get_model_name(model_type)
-        encoder = _get_encoder(model_name, device=device)
+        encoder = _get_encoder(model_name, device=device, batch_size=batch_size)
+
+        # We'll handle the single-reference and multi-reference cases the same way, so change the data format accordingly
+        if not multi_references:
+            references = [[ref] for ref in references]
+
+        # Tokenize sentences if required
+        if tokenize_sentences:
+            predictions = [nltk.tokenize.sent_tokenize(pred) for pred in predictions]
+            references = [[nltk.tokenize.sent_tokenize(ref) for ref in refs] for refs in references]
+
+        # Flatten the data for batch processing
+        all_sentences = flatten_list(predictions) + flatten_list(references)
+
+        # Record sentence counts so the corresponding embeddings can be recovered
+        prediction_sentences_count = [len(pred) for pred in predictions]
+        reference_sentences_count = [[len(ref) for ref in refs] for refs in references]
+
+        # Note: This is the most efficient way of doing it
+        # Encode all sentences in one go
+        embeddings = encoder.encode(all_sentences)
+
+        # Get embeddings corresponding to predictions and references
+        pred_embeddings = slice_embeddings(embeddings, prediction_sentences_count)
+        ref_embeddings = slice_embeddings(embeddings[sum(prediction_sentences_count):], reference_sentences_count)
 
         # Init output scores
-        precisions = [0] * len(predictions)
-        recalls = [0] * len(predictions)
-        f1_scores = [0] * len(predictions)
+        results = []
 
-        # Compute Score in case of single reference
-        if not multi_references:
-            for idx, (pred, ref) in enumerate(zip(predictions, references)):
-
-                # Sentence Tokenize prediction and reference
-                if tokenize_sentences:
-                    ref = nltk.tokenize.sent_tokenize(ref)  # List[str]
-                    pred = nltk.tokenize.sent_tokenize(pred)  # List[str]
-
-                pred_sent_count = len(pred)
-                embeddings = encoder.encode(pred + ref)
-                pred_embeddings = embeddings[:pred_sent_count]
-                ref_embeddings = embeddings[pred_sent_count:]
-
-                p, r = _compute_cosine_similarity(pred_embeddings, ref_embeddings)
-                f1 = _compute_f1(p, r)
-                precisions[idx] = p
-                recalls[idx] = r
-                f1_scores[idx] = f1
-
-        else:
-            # Compute Score in case of multiple reference
-            for idx, (pred, refs) in enumerate(zip(predictions, references)):
-                # Sentence Tokenize prediction and reference
-                if tokenize_sentences:
-                    refs = [nltk.tokenize.sent_tokenize(ref) for ref in refs]  # List[List[str]]
-                    pred = nltk.tokenize.sent_tokenize(pred)  # List[str]
-
-                ref_count = len(refs)
-                pred_sent_count = len(pred)
-                ref_sent_counts = [0] + [len(ref) for ref in refs]
-                cumsum_ref_sent_counts = np.cumsum(ref_sent_counts)
-
-                all_sentences = pred + sum(refs, [])
-                embeddings = encoder.encode(all_sentences)
-                pred_embeddings = embeddings[:pred_sent_count]
-                ref_embeddings = [
-                    embeddings[pred_sent_count + cumsum_ref_sent_counts[c_idx]:
-                               pred_sent_count + cumsum_ref_sent_counts[c_idx + 1]]
-                    for c_idx in range(ref_count)
-                ]
-                # pred_embeddings = encoder.encode(pred)
-                # ref_embeddings = [encoder.encode(refs) for ref in refs]
-
-                # Precision: Concatenate all the sentences in all the references
-                concat_ref_embeddings = np.concatenate(ref_embeddings, axis=0)
-                p, _ = _compute_cosine_similarity(pred_embeddings, concat_ref_embeddings)
-
-                # Recall: Compute individually for each reference
-                scores = [_compute_cosine_similarity(r_embeds, pred_embeddings) for r_embeds in ref_embeddings]
-                r = np.mean([r_scores for (r_scores, _) in scores]).item()
-
-                f1 = _compute_f1(p, r)
-                precisions[idx] = p  # TODO: check why idx says invalid type
-                recalls[idx] = r
-                f1_scores[idx] = f1
-
-        return {"precision": precisions, "recall": recalls, "f1": f1_scores}
+        # Compute scores
+        for preds, refs in zip(pred_embeddings, ref_embeddings):
+            # Precision: Concatenate all the sentences in all the references
+            concat_refs = np.concatenate(refs, axis=0)
+            precision, _ = _compute_cosine_similarity(preds, concat_refs)
+
+            # Recall: Compute individually for each reference
+            recall_scores = [_compute_cosine_similarity(r_embeds, preds) for r_embeds in refs]
+            recall_scores = [r_scores for (r_scores, _) in recall_scores]
+
+            results.append(Scores(precision, recall_scores))
+
+        return results
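Below, a hedged usage sketch of the refactored API, not part of the commit: the Hub id "nbansal/semf1" is an assumption inferred from the committer and module name, the example texts are made up, and it relies on evaluate.Metric.compute forwarding its keyword arguments to _compute as evaluate normally does.

import evaluate

semf1 = evaluate.load("nbansal/semf1")  # assumption: metric id on the Hub

# Single-reference documents; tokenize_sentences=True (the default) splits them.
single_ref = semf1.compute(
    predictions=["The cat sat on the mat. It was purring."],
    references=["A cat was sitting on the mat. The cat purred."],
)

# Multiple references must now be flagged explicitly instead of being inferred.
multi_ref = semf1.compute(
    predictions=["The cat sat on the mat."],
    references=[["A cat was on the mat.", "The cat rested on a mat."]],
    multi_references=True,
)

# _compute now returns a List[Scores]; f1 is derived in Scores.__post_init__.
for score in multi_ref:
    print(score.precision, score.recall, score.f1)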
utils.py ADDED
@@ -0,0 +1,87 @@
+from dataclasses import dataclass
+import statistics
+import sys
+from typing import List, Union
+
+from numpy.typing import NDArray
+
+
+NumSentencesType = Union[List[int], List[List[int]]]
+EmbeddingSlicesType = Union[List[NDArray], List[List[NDArray]]]
+
+
+def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> EmbeddingSlicesType:
+    def _slice_embeddings(s_idx: int, n_sentences: List[int]):
+        _result = []
+        for count in n_sentences:
+            _result.append(embeddings[s_idx:s_idx + count])
+            s_idx += count
+        return _result, s_idx
+
+    if isinstance(num_sentences, list) and all(isinstance(item, int) for item in num_sentences):
+        result, _ = _slice_embeddings(0, num_sentences)
+        return result
+    elif isinstance(num_sentences, list) and all(
+            isinstance(sublist, list) and all(
+                isinstance(item, int) for item in sublist
+            )
+            for sublist in num_sentences
+    ):
+        nested_result = []
+        start_idx = 0
+        for nested_num_sentences in num_sentences:
+            embedding_slice, start_idx = _slice_embeddings(start_idx, nested_num_sentences)
+            nested_result.append(embedding_slice)
+
+        return nested_result
+    else:
+        raise TypeError(f"Incorrect Type for {num_sentences=}")
+
+
+def is_list_of_strings_at_depth(obj, depth: int) -> bool:
+    if depth == 0:
+        return isinstance(obj, str)
+    elif depth > 0:
+        return isinstance(obj, list) and all(is_list_of_strings_at_depth(item, depth - 1) for item in obj)
+    else:
+        raise ValueError("Depth can't be negative")
+
+
+def flatten_list(nested_list: list) -> list:
+    """
+    Recursively flattens a nested list of any depth.
+
+    Parameters:
+        nested_list (list): The nested list to flatten.
+
+    Returns:
+        list: A flat list containing all the elements of the nested list.
+    """
+    flat_list = []
+    for item in nested_list:
+        if isinstance(item, list):
+            flat_list.extend(flatten_list(item))
+        else:
+            flat_list.append(item)
+    return flat_list
+
+
+def compute_f1(p: float, r: float, eps=sys.float_info.epsilon) -> float:
+    """
+    Computes the F1 value.
+    :param p: Precision value
+    :param r: Recall value
+    :param eps: Epsilon value to avoid division by zero
+    :return: F1 score
+    """
+    f1 = 2 * p * r / (p + r + eps)
+    return f1
+
+
+@dataclass
+class Scores:
+    precision: float
+    recall: List[float]
+
+    def __post_init__(self):
+        self.f1: float = compute_f1(self.precision, statistics.fmean(self.recall))
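And a minimal, self-contained sketch, again not part of the commit, of how the utils.py helpers compose; the embedding matrix here is random stand-in data in place of a real encoder output.

import numpy as np

from utils import Scores, flatten_list, slice_embeddings

# Two tokenized predictions with 2 and 1 sentences respectively.
predictions = [["p1 s1", "p1 s2"], ["p2 s1"]]
flat = flatten_list(predictions)       # ["p1 s1", "p1 s2", "p2 s1"]

# Stand-in for encoder.encode(flat): one 4-dim embedding per sentence.
embeddings = np.random.rand(len(flat), 4)

# Recover per-document embeddings from the single batched encode.
per_doc = slice_embeddings(embeddings, [2, 1])
assert per_doc[0].shape == (2, 4) and per_doc[1].shape == (1, 4)

# Scores stores precision plus per-reference recalls and derives f1 in
# __post_init__ from precision and the mean recall.
score = Scores(precision=0.8, recall=[0.7, 0.5])
print(round(score.f1, 3))              # 2*0.8*0.6/(0.8+0.6) ≈ 0.686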