Elron committed on
Commit 3c5feb8
1 Parent(s): 25f46be

Upload metrics.py with huggingface_hub

Files changed (1)
  1. metrics.py +594 -133
metrics.py CHANGED
@@ -1,14 +1,18 @@
 
1
  import re
2
  import string
3
  import uuid
4
- from abc import ABC, abstractmethod
5
  from collections import Counter
6
  from dataclasses import field
7
  from typing import Any, Dict, Generator, List, Optional, Tuple
8
 
9
  import evaluate
10
  import numpy
 
 
11
 
 
12
  from .dataclass import InternalField, OptionalField
13
  from .operator import (
14
  MultiStreamOperator,
@@ -17,8 +21,14 @@ from .operator import (
17
  StreamInstanceOperator,
18
  )
19
  from .operators import CopyFields
 
20
  from .stream import MultiStream, Stream
21
 
 
 
 
 
 
22
 
23
  def abstract_factory():
24
  return {}
@@ -31,23 +41,166 @@ def abstract_field():
31
  class UpdateStream(StreamInstanceOperator):
32
  update: dict
33
 
34
- def process(self, instance: Dict[str, Any], stream_name: str = None) -> Dict[str, Any]:
 
 
35
  instance.update(self.update)
36
  return instance
37
 
38
 
39
  # TODO: currently we have two classes with this name. metric.Metric and matrics.Metric...
40
- class Metric(ABC):
41
  @property
42
  @abstractmethod
43
  def main_score(self):
44
  pass
45
 
46
 
47
- class GlobalMetric(SingleStreamOperator, Metric):
48
- def process(self, stream: Stream, stream_name: str = None) -> Generator:
49
  references = []
50
  predictions = []
 
51
  global_score = {}
52
 
53
  instances = []
@@ -58,58 +211,100 @@ class GlobalMetric(SingleStreamOperator, Metric):
58
  else:
59
  global_score = instance["score"]["global"]
60
 
61
- refs, pred = instance["references"], instance["prediction"]
62
63
  try:
64
- instance_score = self._compute([refs], [pred])
 
 
 
 
65
  except:
66
  instance_score = {"score": None, "score_name": self.main_score}
67
 
68
- if isinstance(self.main_score, str) and self.main_score is not None:
69
  instance_score[self.main_score] = None
70
 
71
  instance["score"]["instance"].update(instance_score)
72
 
73
- references.append(refs)
74
- predictions.append(pred)
75
- instances.append(instance)
76
-
77
- result = self._compute(references, predictions)
78
 
79
  global_score.update(result)
80
 
 
 
 
 
 
 
81
  for instance in instances:
82
  instance["score"]["global"] = global_score
83
  yield instance
84
 
85
- def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
86
- result = self.compute(references, predictions)
87
  result["score"] = result[self.main_score]
88
  result["score_name"] = self.main_score
89
  return result
90
 
91
  @abstractmethod
92
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
93
  pass
94
 
95
 
96
- class BulkInstanceMetric(SingleStreamOperator, Metric):
 
97
  main_score: str
98
  reduction_map: Dict[str, List[str]]
99
 
100
  implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
101
 
102
- def process(self, stream: Stream, stream_name: str = None) -> Generator:
103
  global_score = {}
104
  instances = []
105
 
106
  # consume the stream
107
  references, predictions = map(
108
- list, zip(*[(instance["references"], instance["prediction"]) for instance in stream])
109
  )
110
 
 
 
 
 
 
111
  # compute the metric over all refs and preds
112
- instance_scores = self.compute(references=references, predictions=predictions)
 
 
 
 
113
 
114
  # add the score and score_name fields
115
  for instance_score in instance_scores:
@@ -134,21 +329,38 @@ class BulkInstanceMetric(SingleStreamOperator, Metric):
134
  if reduction == "mean":
135
  from statistics import mean
136
 
137
- for field in fields:
138
- global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
139
- if field == self.main_score:
140
- global_score["score"] = global_score[field]
141
  global_score["score_name"] = self.main_score
142
 
 
 
 
 
 
143
  for instance in instances:
144
  yield instance
145
 
146
  @abstractmethod
147
- def compute(self, references: List[List[Any]], predictions: List[Any]) -> Dict[str, Any]:
148
  pass
149
 
150
 
151
- class InstanceMetric(SingleStreamOperator, Metric):
 
 
152
  implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
153
 
154
  @property
@@ -156,15 +368,21 @@ class InstanceMetric(SingleStreamOperator, Metric):
156
  def reduction_map(self) -> dict:
157
  pass
158
 
159
- def process(self, stream: Stream, stream_name: str = None) -> Generator:
160
  global_score = {}
161
  instances = []
162
 
163
  for instance in stream:
164
  refs, pred = instance["references"], instance["prediction"]
 
 
 
165
 
166
- instance_score = self._compute(refs, pred)
167
-
 
 
 
168
  if "score" not in instance:
169
  instance["score"] = {"global": global_score, "instance": {}}
170
  else:
@@ -182,23 +400,28 @@ class InstanceMetric(SingleStreamOperator, Metric):
182
  if reduction == "mean":
183
  from statistics import mean
184
 
185
- for field in fields:
186
- global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
187
- if field == self.main_score:
188
- global_score["score"] = global_score[field]
 
 
 
 
189
  global_score["score_name"] = self.main_score
190
 
 
 
 
 
 
191
  for instance in instances:
192
  yield instance
193
 
194
- def _compute(self, references: List[str], prediction: str) -> dict:
195
- result = self.compute(references=references, prediction=prediction)
196
- result["score"] = result[self.main_score]
197
- result["score_name"] = self.main_score
198
- return result
199
-
200
  @abstractmethod
201
- def compute(self, references: List[str], prediction: str) -> dict:
 
 
202
  pass
203
 
204
 
@@ -208,46 +431,54 @@ class Squad(GlobalMetric):
208
  metric = "squad"
209
 
210
  def prepare(self):
211
- super(Squad, self).prepare()
212
  self._metric = evaluate.load(self.metric)
213
 
214
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
215
  ids = [str(uuid.uuid4()).replace("-", "") for _ in range(len(predictions))]
216
  formatted_predictions = [
217
- {"prediction_text": prediction, "id": ids[i]} for i, prediction in enumerate(predictions)
 
218
  ]
219
  formatted_references = [
220
  {"answers": {"answer_start": [-1], "text": reference}, "id": ids[i]}
221
  for i, reference in enumerate(references)
222
  ]
223
 
224
- return self._metric.compute(predictions=formatted_predictions, references=formatted_references)
225
-
226
-
227
- class SingleReferenceInstanceMetric(InstanceMetric):
228
- def _compute(self, references: List[str], prediction: str) -> dict:
229
- result = self.compute(references[0], prediction)
230
- result["score"] = result[self.main_score]
231
- result["score_name"] = self.main_score
232
- return result
233
-
234
- @abstractmethod
235
- def compute(self, reference, prediction: str) -> dict:
236
- pass
237
 
238
 
239
- class Accuracy(SingleReferenceInstanceMetric):
240
  reduction_map = {"mean": ["accuracy"]}
241
  main_score = "accuracy"
242
 
243
- def compute(self, reference, prediction: str) -> dict:
244
- return {"accuracy": float(str(reference) == str(prediction))}
245
 
246
 
247
  class MetricPipeline(MultiStreamOperator, Metric):
248
  main_score: str = None
249
  preprocess_steps: Optional[List[StreamingOperator]] = field(default_factory=list)
250
- postpreprocess_steps: Optional[List[StreamingOperator]] = field(default_factory=list)
 
 
251
  metric: Metric = None
252
 
253
  def verify(self):
@@ -269,14 +500,15 @@ class MetricPipeline(MultiStreamOperator, Metric):
269
  multi_stream = self.metric(multi_stream)
270
  for step in self.postpreprocess_steps:
271
  multi_stream = step(multi_stream)
272
- multi_stream = self.prepare_score(multi_stream)
273
- return multi_stream
274
 
275
 
276
  class HuggingfaceMetric(GlobalMetric):
277
  hf_metric_name: str = None
278
  main_score: str = None # The main score returned from the metric
279
- hf_main_score: str = None # USed if HF returns uses a different score name for the main metric
 
 
280
 
281
  scale: float = 1.0 # optional scaling of main results
282
  scaled_fields: list = None
@@ -285,24 +517,39 @@ class HuggingfaceMetric(GlobalMetric):
285
 
286
  def prepare(self):
287
  super().prepare()
288
- self.metric = evaluate.load(self.hf_metric_name, experiment_id=self.experiment_id)
 
 
289
 
290
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
291
- result = self.metric.compute(predictions=predictions, references=references, **self.hf_compute_args)
292
  if self.hf_main_score:
293
  result[self.main_score] = result[self.hf_main_score]
294
  del result[self.hf_main_score]
295
  if self.scale != 1.0:
296
- assert self.scaled_fields is not None, f"Scaling factor was set to {self.scale}, but no fields specified"
 
 
297
  for key in self.scaled_fields:
298
- assert key in result, f"Trying to scale field '{key}' which is not in results of metrics: {result}"
 
 
299
  if isinstance(result[key], list):
300
  assert all(
301
  isinstance(v, float) for v in result[key]
302
  ), "Not all scaled field '{key}' values are floats: {result[key]}"
303
  result[key] = [v / self.scale for v in result[key]]
304
  else:
305
- assert isinstance(result[key], float), "Scaled field '{key}' is not float: {result[key]}"
 
 
306
  result[key] /= self.scale
307
  return result
308
 
@@ -317,8 +564,15 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
317
  super().prepare()
318
  self.metric = evaluate.load(self.hf_metric_name)
319
 
320
- def compute(self, references: List[List[str]], predictions: List[str]) -> List[Dict[str, Any]]:
321
- scores = self.metric.compute(predictions=predictions, references=references, **self.hf_compute_args)
322
 
323
  # convert dict of lists to a list of dicts
324
  results = [{} for _ in range(len(scores[self.hf_metric_fields[0]]))]
@@ -337,7 +591,7 @@ class F1(GlobalMetric):
337
  metric = "f1"
338
 
339
  def prepare(self):
340
- super(F1, self).prepare()
341
  self._metric = evaluate.load(self.metric)
342
 
343
  def get_str_id(self, str):
@@ -347,18 +601,30 @@ class F1(GlobalMetric):
347
  self.id_to_str[id] = str
348
  return self.str_to_id[str]
349
 
350
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
351
  assert all(
352
  len(reference) == 1 for reference in references
353
  ), "Only a single reference per prediction is allowed in F1 metric"
354
  self.str_to_id = {}
355
  self.id_to_str = {}
356
- formatted_references = [self.get_str_id(reference[0]) for reference in references]
357
- unique_labels = self.str_to_id.keys()
358
- formatted_predictions = [self.get_str_id(prediction) for prediction in predictions]
 
 
 
 
359
  labels = list(set(formatted_references))
360
  result = self._metric.compute(
361
- predictions=formatted_predictions, references=formatted_references, labels=labels, average=self.average
 
 
 
362
  )
363
  if isinstance(result["f1"], numpy.ndarray):
364
  from statistics import mean
@@ -380,6 +646,11 @@ class F1Macro(F1):
380
  main_score = "f1_macro"
381
 
382
 
 
 
 
 
 
383
  class F1MultiLabel(GlobalMetric):
384
  _metric = None
385
  main_score = "f1_macro"
@@ -387,11 +658,11 @@ class F1MultiLabel(GlobalMetric):
387
  classes_to_ignore = ["none"]
388
 
389
  def prepare(self):
390
- super(F1MultiLabel, self).prepare()
391
  self._metric = evaluate.load("f1", "multilabel")
392
 
393
  def add_str_to_id(self, str):
394
- if not str in self.str_to_id:
395
  id = len(self.str_to_id)
396
  self.str_to_id[str] = id
397
  self.id_to_str[id] = str
@@ -404,17 +675,34 @@ class F1MultiLabel(GlobalMetric):
404
  result[self.str_to_id[label]] = 1
405
  return result
406
 
407
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
408
  self.str_to_id = {}
409
  self.id_to_str = {}
410
  assert all(
411
  len(reference) == 1 for reference in references
412
- ), "Only a single reference per prediction is allowed in F1 metric"
 
413
references = [reference[0] for reference in references]
414
  labels = [
415
- l
416
- for l in set([label for reference in references for label in reference])
417
- if l not in self.classes_to_ignore
418
  ]
419
  # if no classes are left then F1 is not defined
420
  # (e.g. only "none" in references)
@@ -423,8 +711,12 @@ class F1MultiLabel(GlobalMetric):
423
 
424
  for label in labels:
425
  self.add_str_to_id(label)
426
- formatted_references = [self.get_one_hot_vector(reference) for reference in references]
427
- formatted_predictions = [self.get_one_hot_vector(prediction) for prediction in predictions]
 
 
 
 
428
 
429
  # There is odd behavior in scikit-learn that when passing a one-hot vector with a single
430
  # element, it is treated a class identifier. Therefore, we add labels=[1] to limit to only
@@ -475,37 +767,53 @@ class Rouge(HuggingfaceMetric):
475
  sent_split_newline: bool = True
476
 
477
  def prepare(self):
478
- self.hf_compute_args.update({"use_aggregator": self.use_aggregator, "rouge_types": self.rouge_types})
479
-
480
super().prepare()
481
  import nltk
482
 
483
  nltk.download("punkt")
484
  self.sent_tokenize = nltk.sent_tokenize
485
 
486
- def compute(self, references, predictions):
487
  if self.sent_split_newline:
488
- predictions = ["\n".join(self.sent_tokenize(prediction.strip())) for prediction in predictions]
489
- references = [["\n".join(self.sent_tokenize(r.strip())) for r in reference] for reference in references]
490
- return super().compute(references, predictions)
491
-
492
-
493
- # Computes chat edit distance, ignoring whitespace
494
- class CharEditDistanceAccuracy(SingleReferenceInstanceMetric):
495
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
496
  main_score = "char_edit_dist_accuracy"
497
 
498
  def prepare(self):
 
499
  import editdistance
500
 
501
  self.eval = editdistance.eval
502
 
503
- def compute(self, reference, prediction: str) -> dict:
504
  formatted_prediction = "".join(prediction.split())
505
- formatted_reference = "".join(reference.split())
506
  max_length = max(len(formatted_reference), len(formatted_prediction))
507
  if max_length == 0:
508
- return 0
509
  edit_dist = self.eval(formatted_reference, formatted_prediction)
510
  return {"char_edit_dist_accuracy": (1 - edit_dist / max_length)}
511
 
@@ -514,12 +822,19 @@ class Wer(HuggingfaceMetric):
514
  hf_metric_name = "wer"
515
  main_score = "wer"
516
 
517
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
518
  assert all(
519
  len(reference) == 1 for reference in references
520
  ), "Only single reference per prediction is allowed in wer metric"
521
  formatted_references = [reference[0] for reference in references]
522
- result = self.metric.compute(predictions=predictions, references=formatted_references)
 
 
523
  return {self.main_score: result}
524
 
525
 
@@ -534,16 +849,27 @@ class MatthewsCorrelation(HuggingfaceMetric):
534
  self.str_to_id[str] = id
535
  return self.str_to_id[str]
536
 
537
- def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
538
- formatted_references = [self.get_str_id(reference[0]) for reference in references]
539
- formatted_predictions = [self.get_str_id(prediction) for prediction in predictions]
540
- result = self.metric.compute(predictions=formatted_predictions, references=formatted_references)
541
- return result
542
 
543
 
544
  class CustomF1(GlobalMetric):
545
  main_score = "f1_micro"
546
  classes = None
 
547
 
548
  @abstractmethod
549
  def get_element_group(self, element):
@@ -553,40 +879,64 @@ class CustomF1(GlobalMetric):
553
  def get_element_representation(self, element):
554
  pass
555
 
556
- def group_elements(self, l):
557
  return {
558
- k: Counter([self.get_element_representation(value) for value in l if self.get_element_group(value) == k])
559
- for k in set([self.get_element_group(e) for e in l])
560
  }
561
 
562
  def calculate_groups_ratio(self, actual_group, total_group):
563
- return sum([min(actual_group[k], total_group[k]) for k in actual_group.keys()]), sum(actual_group.values())
564
 
565
  def f1(self, pn, pd, rn, rd):
566
- precision = 1.0 if pn == 0 and pd == 0 else pn / pd
567
- recall = 1.0 if rn == 0 and rd == 0 else rn / rd
568
  try:
569
  return 2 * precision * recall / (precision + recall)
570
  except ZeroDivisionError:
571
- return 0.0
572
-
573
- def compute(self, references: List[Any], predictions: List[Any]) -> dict:
574
  # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]:
575
  if isinstance(references[0], list) and isinstance(references[0][0], list):
576
  references = [element[0] for element in references]
577
 
578
  assert len(references) == len(predictions), (
579
- f"references size ({len(references)})" f" doesn't mach predictions sise ({len(references)})."
 
580
  )
581
  if self.classes is None:
582
- classes = set([self.get_element_group(e) for sublist in references for e in sublist])
 
 
583
  else:
584
  classes = self.classes
585
- groups_statistics = dict()
586
  for references_batch, predictions_batch in zip(references, predictions):
587
  grouped_references = self.group_elements(references_batch)
588
  grouped_predictions = self.group_elements(predictions_batch)
589
- all_groups = set(grouped_references.keys()).union(grouped_predictions.keys())
 
 
590
  for group in all_groups:
591
  if group not in groups_statistics:
592
  groups_statistics[group] = {
@@ -608,9 +958,11 @@ class CustomF1(GlobalMetric):
608
  groups_statistics[group]["recall_numerator"] += rn
609
  groups_statistics[group]["recall_denominator"] += rd
610
 
611
- result = {}
612
  num_of_unknown_class_predictions = 0
613
  pn_total = pd_total = rn_total = rd_total = 0
 
 
 
614
  for group in groups_statistics.keys():
615
  pn, pd, rn, rd = (
616
  groups_statistics[group]["precision_numerator"],
@@ -618,22 +970,45 @@ class CustomF1(GlobalMetric):
618
  groups_statistics[group]["recall_numerator"],
619
  groups_statistics[group]["recall_denominator"],
620
  )
621
- pn_total, pd_total, rn_total, rd_total = pn_total + pn, pd_total + pd, rn_total + rn, rd_total + rd
622
  if group in classes:
623
- result[f"f1_{group}"] = self.f1(pn, pd, rn, rd)
 
 
624
  else:
625
  num_of_unknown_class_predictions += pd
 
 
626
  try:
627
- result["f1_macro"] = sum(result.values()) / len(result.keys())
628
  except ZeroDivisionError:
629
- result["f1_macro"] = 1.0
 
 
630
 
631
  amount_of_predictions = pd_total
632
  if amount_of_predictions == 0:
633
  result["in_classes_support"] = 1.0
634
  else:
635
- result["in_classes_support"] = 1.0 - num_of_unknown_class_predictions / amount_of_predictions
636
- result[f"f1_micro"] = self.f1(pn_total, pd_total, rn_total, rd_total)
637
  return result
638
 
639
 
@@ -668,11 +1043,20 @@ class TokenOverlap(InstanceMetric):
668
  reduction_map = {"mean": ["f1", "precision", "recall"]}
669
  main_score = "f1"
670
 
671
- def compute(self, references: List[Any], prediction: Any) -> dict:
672
- results = [self._compute_single_ref(reference, prediction) for reference in references]
673
- return {measure: max(r[i] for r in results) for i, measure in enumerate(["precision", "recall", "f1"])}
674
 
675
- def _compute_single_ref(self, reference: Any, prediction: Any) -> Tuple[float, float, float]:
 
 
676
  prediction_tokens = normalize_answer(prediction).split()
677
  reference_tokens = normalize_answer(reference).split()
678
  common = Counter(prediction_tokens) & Counter(reference_tokens)
@@ -713,7 +1097,12 @@ class SentenceBert(BulkInstanceMetric):
713
  self.model = SentenceTransformer(self.model_name)
714
  self.util = sbert_util
715
 
716
- def compute(self, references: List[List[Any]], predictions: List[Any]) -> List[Any]:
717
  scores = []
718
 
719
  # we are in a multi-reference case (each prediction may have multiple
@@ -728,7 +1117,9 @@ class SentenceBert(BulkInstanceMetric):
728
 
729
  # compute s-bert embeddings
730
  preds_emb = self.model.encode(predictions)
731
- refs_emb = self.model.encode([ref for ref_group in references for ref in ref_group])
 
 
732
 
733
  # for each candidate, pick the reference with the highest score
734
  for pred_emb, ref_group_bounds in zip(preds_emb, ref_group_boundaries):
@@ -746,11 +1137,17 @@ class Reward(BulkInstanceMetric):
746
  model_name: str
747
 
748
  def prepare(self):
 
749
  from transformers import pipeline
750
 
751
  self.pipe = pipeline("text-classification", model=self.model_name)
752
 
753
- def compute(self, references: List[List[Any]], predictions: List[Any]) -> List[Any]:
754
  # treat the references as the questions and the predictions as answers
755
  # assume a single reference
756
  questions = [refs[0] for refs in references]
@@ -762,3 +1159,67 @@ class Reward(BulkInstanceMetric):
762
  # compute the metric
763
  # add function_to_apply="none" to disable sigmoid
764
return self.pipe(inputs, batch_size=self.batch_size)
1
+ import logging
2
  import re
3
  import string
4
  import uuid
5
+ from abc import abstractmethod
6
  from collections import Counter
7
  from dataclasses import field
8
  from typing import Any, Dict, Generator, List, Optional, Tuple
9
 
10
  import evaluate
11
  import numpy
12
+ import numpy as np
13
+ from scipy.stats import bootstrap
14
 
15
+ from .artifact import Artifact
16
  from .dataclass import InternalField, OptionalField
17
  from .operator import (
18
  MultiStreamOperator,
 
21
  StreamInstanceOperator,
22
  )
23
  from .operators import CopyFields
24
+ from .random_utils import get_seed
25
  from .stream import MultiStream, Stream
26
 
27
+ # The default number of resamples used to estimate the confidence intervals
28
+ # for global and instance metrics. Use None to disable confidence interval computation by default.
29
+ _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS = 1000
30
+ _N_RESAMPLES_DEFAULT_FOR_GLOBAL_METRICS = 100
31
+
32
 
33
  def abstract_factory():
34
  return {}
 
41
  class UpdateStream(StreamInstanceOperator):
42
  update: dict
43
 
44
+ def process(
45
+ self, instance: Dict[str, Any], stream_name: Optional[str] = None
46
+ ) -> Dict[str, Any]:
47
  instance.update(self.update)
48
  return instance
49
 
50
 
51
  # TODO: currently we have two classes with this name. metric.Metric and matrics.Metric...
52
+ class Metric(Artifact):
53
  @property
54
  @abstractmethod
55
  def main_score(self):
56
  pass
57
 
58
 
59
+ class MetricWithConfidenceInterval(Metric):
60
+ # The number of resamples used to estimate the confidence intervals of this metric.
61
+ # Use None to disable confidence interval computation.
62
+ n_resamples: int = None
63
+ confidence_level: float = 0.95
64
+
65
+ @staticmethod
66
+ def new_random_generator():
67
+ # The np.random.default_rng expects a 32-bit int, while hash(..) can return a 64-bit integer.
68
+ # So use '& MAX_32BIT' to get a 32-bit seed.
69
+ _max_32bit = 2**32 - 1
70
+ return np.random.default_rng(hash(get_seed()) & _max_32bit)
71
+
72
+ def disable_confidence_interval_calculation(self):
73
+ self.n_resamples = None
74
+
75
+ def _can_compute_confidence_intervals(self, num_predictions):
76
+ return (
77
+ self.n_resamples is not None
78
+ and self.n_resamples > 1
79
+ and num_predictions > 1
80
+ )
81
+
82
+ def score_based_confidence_interval(self, score_names: List[str], instances):
83
+ """Compute confidence intervals based on existing scores, already computed on the input instances.
84
+
85
+ score_names: List[str]
86
+ Compute a confidence interval for each score_name from this list.
87
+ instances:
88
+ The instances for which the confidence intervals are computed.
89
+ """
90
+ from statistics import mean
91
+
92
+ result = {}
93
+
94
+ if not self._can_compute_confidence_intervals(num_predictions=len(instances)):
95
+ return result
96
+
97
+ for score_name in score_names:
98
+ scores = [
99
+ instance["score"]["instance"][score_name] for instance in instances
100
+ ]
101
+ ci = bootstrap(
102
+ (scores,),
103
+ statistic=mean,
104
+ n_resamples=self.n_resamples,
105
+ confidence_level=self.confidence_level,
106
+ random_state=self.new_random_generator(),
107
+ ).confidence_interval
108
+ result[f"{score_name}_ci_low"] = ci.low
109
+ result[f"{score_name}_ci_high"] = ci.high
110
+ if score_name == self.main_score:
111
+ result["score_ci_low"] = ci.low
112
+ result["score_ci_high"] = ci.high
113
+ return result
114
+
115
+ def compute_global_confidence_intervals(
116
+ self, references, predictions, additional_inputs, score_name
117
+ ):
118
+ """Compute confidence intervals for a set of references and predictions."""
119
+ random_gen = self.new_random_generator()
120
+
121
+ def statistic(arr, axis):
122
+ # arr is a 2d array where each row is a resampling, so we
123
+ # iterate over the rows and compute the metric on each resampling
124
+ def metric(sample_refs, sample_preds, sample_additional_inputs):
125
+ try:
126
+ return self._compute(
127
+ references=sample_refs,
128
+ predictions=sample_preds,
129
+ additional_inputs=sample_additional_inputs,
130
+ )["score"]
131
+ except Exception as e:
132
+ # this happens in edge cases, for example, when the sampling creates a
133
+ # sample where all strings are empty and this fails bleu.
134
+ logging.info(f"Warning in {self.__class__.__name__}: {e}")
135
+ return np.nan
136
+
137
+ scores = numpy.apply_along_axis(
138
+ lambda x: metric(
139
+ sample_refs=[references[i] for i in x],
140
+ sample_preds=[predictions[i] for i in x],
141
+ sample_additional_inputs=[additional_inputs[i] for i in x],
142
+ ),
143
+ axis=axis,
144
+ arr=arr,
145
+ )
146
+
147
+ # when running with bca interval (default), the statistic is called twice: with the
148
+ # original data and with the resamples. here we want to focus only on the latter.
149
+ if scores.size > 1:
150
+ # here we deal with samples on which the metric could not be computed. These are
151
+ # edge cases - for example, when the sample contains only empty strings.
152
+ # CI is about the distribution around the statistic (e.g. mean), it doesn't deal with
153
+ # cases in which the metric is not computable. Therefore, we ignore these edge cases
154
+ # as part of the computation of CI. The question is how to implement this policy.
155
+ # Options:
156
+ # 1. skip the errors and return a shorter array => this fails because Scipy demands
157
+ # this callback (i.e. the statistic() callback) to return an array of the same size
158
+ # as the number of resamples
159
+ # 2. Put np.nan for the errors => this fails because in such case the ci itself
160
+ # becomes np.nan. So one edge case can fail the whole CI computation.
161
+ # 3. Replace the errors with a sampling from the successful cases => this is what
162
+ # is implemented.
163
+ error_indices = numpy.isnan(scores)
164
+ n_errors = sum(error_indices)
165
+ if n_errors > 0:
166
+ new_scores = random_gen.choice(scores, n_errors, replace=True)
167
+ scores = scores[~error_indices]
168
+ scores = np.concatenate([scores, new_scores])
169
+
170
+ return scores
171
+
172
+ result = {}
173
+ num_predictions = len(predictions)
174
+ if self._can_compute_confidence_intervals(num_predictions=num_predictions):
175
+ identifiers = list(range(num_predictions))
176
+ ci = bootstrap(
177
+ (identifiers,),
178
+ statistic=statistic,
179
+ n_resamples=self.n_resamples,
180
+ confidence_level=self.confidence_level,
181
+ random_state=random_gen,
182
+ ).confidence_interval
183
+ result["score_ci_low"] = ci.low
184
+ result["score_ci_high"] = ci.high
185
+ result[f"{score_name}_ci_low"] = ci.low
186
+ result[f"{score_name}_ci_high"] = ci.high
187
+ return result
188
+
189
+
190
+ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
191
+ """A class for computing metrics that require joint calculations over all instances and are not just an aggregation of scores of individual instances.
192
+
193
+ For example, macro_F1 requires
194
+ calculation of recall and precision per class, so all instances of the class
195
+ need to be considered. Accuracy, on the other hand, is just an average of the accuracy of all the instances.
196
+ """
197
+
198
+ n_resamples = _N_RESAMPLES_DEFAULT_FOR_GLOBAL_METRICS
199
+
200
+ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
201
  references = []
202
  predictions = []
203
+ additional_inputs = []
204
  global_score = {}
205
 
206
  instances = []
 
211
  else:
212
  global_score = instance["score"]["global"]
213
 
214
+ instance_references, instance_prediction = (
215
+ instance["references"],
216
+ instance["prediction"],
217
+ )
218
+ references.append(instance_references)
219
+ predictions.append(instance_prediction)
220
+ instances.append(instance)
221
 
222
+ instance_additional_inputs = (
223
+ instance["additional_inputs"] if "additional_inputs" in instance else {}
224
+ )
225
+ additional_inputs.append(instance_additional_inputs)
226
  try:
227
+ instance_score = self._compute(
228
+ [instance_references],
229
+ [instance_prediction],
230
+ [instance_additional_inputs],
231
+ )
232
  except:
233
  instance_score = {"score": None, "score_name": self.main_score}
234
 
235
+ if isinstance(self.main_score, str):
236
  instance_score[self.main_score] = None
237
 
238
  instance["score"]["instance"].update(instance_score)
239
 
240
+ result = self._compute(references, predictions, additional_inputs)
 
 
 
 
241
 
242
  global_score.update(result)
243
 
244
+ score_name = global_score["score_name"]
245
+ confidence_interval = self.compute_global_confidence_intervals(
246
+ references, predictions, additional_inputs, score_name
247
+ )
248
+ global_score.update(confidence_interval)
249
+
250
  for instance in instances:
251
  instance["score"]["global"] = global_score
252
  yield instance
253
 
254
+ def _compute(
255
+ self,
256
+ references: List[List[str]],
257
+ predictions: List[str],
258
+ additional_inputs: List[Any],
259
+ ) -> dict:
260
+ result = self.compute(references, predictions, additional_inputs)
261
  result["score"] = result[self.main_score]
262
  result["score_name"] = self.main_score
263
  return result
264
 
265
  @abstractmethod
266
+ def compute(
267
+ self,
268
+ references: List[List[Any]],
269
+ predictions: List[Any],
270
+ additional_inputs: List[Any],
271
+ ) -> dict:
272
  pass
273
 
274
 
275
+ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
276
+ n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS
277
  main_score: str
278
  reduction_map: Dict[str, List[str]]
279
 
280
  implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
281
 
282
+ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
283
  global_score = {}
284
  instances = []
285
 
286
  # consume the stream
287
  references, predictions = map(
288
+ list,
289
+ zip(
290
+ *[
291
+ (instance["references"], instance["prediction"])
292
+ for instance in stream
293
+ ]
294
+ ),
295
  )
296
 
297
+ additional_inputs = [
298
+ instance["additional_inputs"] if "additional_inputs" in instance else {}
299
+ for instance in stream
300
+ ]
301
+
302
  # compute the metric over all refs and preds
303
+ instance_scores = self.compute(
304
+ references=references,
305
+ predictions=predictions,
306
+ additional_inputs=additional_inputs,
307
+ )
308
 
309
  # add the score and score_name fields
310
  for instance_score in instance_scores:
 
329
  if reduction == "mean":
330
  from statistics import mean
331
 
332
+ for field_name in fields:
333
+ global_score[field_name] = mean(
334
+ [
335
+ instance["score"]["instance"][field_name]
336
+ for instance in instances
337
+ ]
338
+ )
339
+ if field_name == self.main_score:
340
+ global_score["score"] = global_score[field_name]
341
  global_score["score_name"] = self.main_score
342
 
343
+ confidence_interval = self.score_based_confidence_interval(
344
+ score_names=[self.main_score], instances=instances
345
+ )
346
+ global_score.update(confidence_interval)
347
+
348
  for instance in instances:
349
  yield instance
350
 
351
  @abstractmethod
352
+ def compute(
353
+ self,
354
+ references: List[List[Any]],
355
+ predictions: List[Any],
356
+ additional_inputs: List[Dict],
357
+ ) -> Dict[str, Any]:
358
  pass
359
 
360
 
361
+ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
362
+ n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS
363
+
364
  implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
365
 
366
  @property
 
368
  def reduction_map(self) -> dict:
369
  pass
370
 
371
+ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
372
  global_score = {}
373
  instances = []
374
 
375
  for instance in stream:
376
  refs, pred = instance["references"], instance["prediction"]
377
+ additional_inputs = (
378
+ instance["additional_inputs"] if "additional_inputs" in instance else {}
379
+ )
380
 
381
+ instance_score = self.compute(
382
+ references=refs, prediction=pred, additional_inputs=additional_inputs
383
+ )
384
+ instance_score["score"] = instance_score[self.main_score]
385
+ instance_score["score_name"] = self.main_score
386
  if "score" not in instance:
387
  instance["score"] = {"global": global_score, "instance": {}}
388
  else:
 
400
  if reduction == "mean":
401
  from statistics import mean
402
 
403
+ for field_name in fields:
404
+ scores = [
405
+ instance["score"]["instance"][field_name]
406
+ for instance in instances
407
+ ]
408
+ global_score[field_name] = mean(scores)
409
+ if field_name == self.main_score:
410
+ global_score["score"] = global_score[field_name]
411
  global_score["score_name"] = self.main_score
412
 
413
+ confidence_interval = self.score_based_confidence_interval(
414
+ score_names=[self.main_score], instances=instances
415
+ )
416
+ global_score.update(confidence_interval)
417
+
418
  for instance in instances:
419
  yield instance
420
 
 
 
 
 
 
 
421
  @abstractmethod
422
+ def compute(
423
+ self, references: List[Any], prediction: Any, additional_inputs: Dict
424
+ ) -> dict:
425
  pass
426
 
427
 
 
431
  metric = "squad"
432
 
433
  def prepare(self):
434
+ super().prepare()
435
  self._metric = evaluate.load(self.metric)
436
 
437
+ def compute(
438
+ self,
439
+ references: List[List[str]],
440
+ predictions: List[str],
441
+ additional_inputs: List[Dict],
442
+ ) -> dict:
443
  ids = [str(uuid.uuid4()).replace("-", "") for _ in range(len(predictions))]
444
  formatted_predictions = [
445
+ {"prediction_text": prediction, "id": ids[i]}
446
+ for i, prediction in enumerate(predictions)
447
  ]
448
  formatted_references = [
449
  {"answers": {"answer_start": [-1], "text": reference}, "id": ids[i]}
450
  for i, reference in enumerate(references)
451
  ]
452
 
453
+ return self._metric.compute(
454
+ predictions=formatted_predictions,
455
+ references=formatted_references,
456
+ )
457
 
458
 
459
+ class Accuracy(InstanceMetric):
460
  reduction_map = {"mean": ["accuracy"]}
461
  main_score = "accuracy"
462
 
463
+ def compute(
464
+ self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
465
+ ) -> dict:
466
+ result = {
467
+ self.main_score: float(
468
+ str(prediction) in [str(reference) for reference in references]
469
+ )
470
+ }
471
+ result["score"] = result[self.main_score]
472
+ result["score_name"] = self.main_score
473
+ return result
474
 
475
 
476
  class MetricPipeline(MultiStreamOperator, Metric):
477
  main_score: str = None
478
  preprocess_steps: Optional[List[StreamingOperator]] = field(default_factory=list)
479
+ postpreprocess_steps: Optional[List[StreamingOperator]] = field(
480
+ default_factory=list
481
+ )
482
  metric: Metric = None
483
 
484
  def verify(self):
 
500
  multi_stream = self.metric(multi_stream)
501
  for step in self.postpreprocess_steps:
502
  multi_stream = step(multi_stream)
503
+ return self.prepare_score(multi_stream)
 
504
 
505
 
506
  class HuggingfaceMetric(GlobalMetric):
507
  hf_metric_name: str = None
508
  main_score: str = None # The main score returned from the metric
509
+ hf_main_score: str = (
510
+ None  # Used if HF uses a different score name for the main metric
511
+ )
512
 
513
  scale: float = 1.0 # optional scaling of main results
514
  scaled_fields: list = None
 
517
 
518
  def prepare(self):
519
  super().prepare()
520
+ self.metric = evaluate.load(
521
+ self.hf_metric_name, experiment_id=self.experiment_id
522
+ )
523
 
524
+ def compute(
525
+ self,
526
+ references: List[List[Any]],
527
+ predictions: List[Any],
528
+ additional_inputs: List[Dict],
529
+ ) -> dict:
530
+ result = self.metric.compute(
531
+ predictions=predictions, references=references, **self.hf_compute_args
532
+ )
533
  if self.hf_main_score:
534
  result[self.main_score] = result[self.hf_main_score]
535
  del result[self.hf_main_score]
536
  if self.scale != 1.0:
537
+ assert (
538
+ self.scaled_fields is not None
539
+ ), f"Scaling factor was set to {self.scale}, but no fields specified"
540
  for key in self.scaled_fields:
541
+ assert (
542
+ key in result
543
+ ), f"Trying to scale field '{key}' which is not in results of metrics: {result}"
544
  if isinstance(result[key], list):
545
  assert all(
546
  isinstance(v, float) for v in result[key]
547
  ), "Not all scaled field '{key}' values are floats: {result[key]}"
548
  result[key] = [v / self.scale for v in result[key]]
549
  else:
550
+ assert isinstance(
551
+ result[key], float
552
+ ), "Scaled field '{key}' is not float: {result[key]}"
553
  result[key] /= self.scale
554
  return result
555
 
 
564
  super().prepare()
565
  self.metric = evaluate.load(self.hf_metric_name)
566
 
567
+ def compute(
568
+ self,
569
+ references: List[List[str]],
570
+ predictions: List[str],
571
+ additional_inputs: List[Any],
572
+ ) -> List[Dict[str, Any]]:
573
+ scores = self.metric.compute(
574
+ predictions=predictions, references=references, **self.hf_compute_args
575
+ )
576
 
577
  # convert dict of lists to a list of dicts
578
  results = [{} for _ in range(len(scores[self.hf_metric_fields[0]]))]
 
591
  metric = "f1"
592
 
593
  def prepare(self):
594
+ super().prepare()
595
  self._metric = evaluate.load(self.metric)
596
 
597
  def get_str_id(self, str):
 
601
  self.id_to_str[id] = str
602
  return self.str_to_id[str]
603
 
604
+ def compute(
605
+ self,
606
+ references: List[List[str]],
607
+ predictions: List[str],
608
+ additional_inputs: List[Dict],
609
+ ) -> dict:
610
  assert all(
611
  len(reference) == 1 for reference in references
612
  ), "Only a single reference per prediction is allowed in F1 metric"
613
  self.str_to_id = {}
614
  self.id_to_str = {}
615
+ formatted_references = [
616
+ self.get_str_id(reference[0]) for reference in references
617
+ ]
618
+ self.str_to_id.keys()
619
+ formatted_predictions = [
620
+ self.get_str_id(prediction) for prediction in predictions
621
+ ]
622
  labels = list(set(formatted_references))
623
  result = self._metric.compute(
624
+ predictions=formatted_predictions,
625
+ references=formatted_references,
626
+ labels=labels,
627
+ average=self.average,
628
  )
629
  if isinstance(result["f1"], numpy.ndarray):
630
  from statistics import mean
 
646
  main_score = "f1_macro"
647
 
648
 
649
+ class F1Weighted(F1):
650
+ main_score = "f1_weighted"
651
+ average = "weighted"
652
+
653
+
654
  class F1MultiLabel(GlobalMetric):
655
  _metric = None
656
  main_score = "f1_macro"
 
658
  classes_to_ignore = ["none"]
659
 
660
  def prepare(self):
661
+ super().prepare()
662
  self._metric = evaluate.load("f1", "multilabel")
663
 
664
  def add_str_to_id(self, str):
665
+ if str not in self.str_to_id:
666
  id = len(self.str_to_id)
667
  self.str_to_id[str] = id
668
  self.id_to_str[id] = str
 
675
  result[self.str_to_id[label]] = 1
676
  return result
677
 
678
+ def compute(
679
+ self,
680
+ references: List[List[str]],
681
+ predictions: List[List[str]],
682
+ additional_inputs: List[Dict],
683
+ ) -> dict:
684
  self.str_to_id = {}
685
  self.id_to_str = {}
686
  assert all(
687
  len(reference) == 1 for reference in references
688
+ ), "Only a single reference per prediction is allowed in F1 multi label metric"
689
+
690
  references = [reference[0] for reference in references]
691
+
692
+ for reference in references:
693
+ assert isinstance(
694
+ reference, list
695
+ ), f"Each reference is expected to be a list of strings in F1 multi label metric. Received reference: {reference}"
696
+
697
+ for prediction in predictions:
698
+ assert isinstance(
699
+ prediction, list
700
+ ), f"Each prediction is expected to be a list of strings in F1 multi label metric. Received prediction: {prediction}"
701
+
702
  labels = [
703
+ lbl
704
+ for lbl in {label for reference in references for label in reference}
705
+ if lbl not in self.classes_to_ignore
706
  ]
707
  # if no classes are left then F1 is not defined
708
  # (e.g. only "none" in references)
 
711
 
712
  for label in labels:
713
  self.add_str_to_id(label)
714
+ formatted_references = [
715
+ self.get_one_hot_vector(reference) for reference in references
716
+ ]
717
+ formatted_predictions = [
718
+ self.get_one_hot_vector(prediction) for prediction in predictions
719
+ ]
720
 
721
  # There is odd behavior in scikit-learn that when passing a one-hot vector with a single
722
  # element, it is treated a class identifier. Therefore, we add labels=[1] to limit to only
 
767
  sent_split_newline: bool = True
768
 
769
  def prepare(self):
 
 
770
  super().prepare()
771
+
772
+ self.hf_compute_args.update(
773
+ {"use_aggregator": self.use_aggregator, "rouge_types": self.rouge_types}
774
+ )
775
+
776
  import nltk
777
 
778
  nltk.download("punkt")
779
  self.sent_tokenize = nltk.sent_tokenize
780
 
781
+ def compute(self, references, predictions, additional_inputs: List[Dict]):
782
  if self.sent_split_newline:
783
+ predictions = [
784
+ "\n".join(self.sent_tokenize(prediction.strip()))
785
+ for prediction in predictions
786
+ ]
787
+ references = [
788
+ ["\n".join(self.sent_tokenize(r.strip())) for r in reference]
789
+ for reference in references
790
+ ]
791
+ return super().compute(references, predictions, additional_inputs)
792
+
793
+
794
+ # Computes char edit distance, ignoring whitespace
795
+ class CharEditDistanceAccuracy(InstanceMetric):
796
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
797
  main_score = "char_edit_dist_accuracy"
798
 
799
  def prepare(self):
800
+ super().prepare()
801
  import editdistance
802
 
803
  self.eval = editdistance.eval
804
 
805
+ def compute(
806
+ self, references, prediction: str, additional_inputs: List[Dict]
807
+ ) -> dict:
808
+ assert (
809
+ len(references) == 1
810
+ ), f"Expected only one reference, but received: {references}"
811
+
812
  formatted_prediction = "".join(prediction.split())
813
+ formatted_reference = "".join(references[0].split())
814
  max_length = max(len(formatted_reference), len(formatted_prediction))
815
  if max_length == 0:
816
+ return {"char_edit_dist_accuracy": 0.0}
817
  edit_dist = self.eval(formatted_reference, formatted_prediction)
818
  return {"char_edit_dist_accuracy": (1 - edit_dist / max_length)}
819
 
 
822
  hf_metric_name = "wer"
823
  main_score = "wer"
824
 
825
+ def compute(
826
+ self,
827
+ references: List[List[str]],
828
+ predictions: List[str],
829
+ additional_inputs: List[Dict],
830
+ ) -> dict:
831
  assert all(
832
  len(reference) == 1 for reference in references
833
  ), "Only single reference per prediction is allowed in wer metric"
834
  formatted_references = [reference[0] for reference in references]
835
+ result = self.metric.compute(
836
+ predictions=predictions, references=formatted_references
837
+ )
838
  return {self.main_score: result}
839
 
840
 
 
849
  self.str_to_id[str] = id
850
  return self.str_to_id[str]
851
 
852
+ def compute(
853
+ self,
854
+ references: List[List[str]],
855
+ predictions: List[str],
856
+ additional_inputs: List[Dict],
857
+ ) -> dict:
858
+ formatted_references = [
859
+ self.get_str_id(reference[0]) for reference in references
860
+ ]
861
+ formatted_predictions = [
862
+ self.get_str_id(prediction) for prediction in predictions
863
+ ]
864
+ return self.metric.compute(
865
+ predictions=formatted_predictions, references=formatted_references
866
+ )
867
 
868
 
869
  class CustomF1(GlobalMetric):
870
  main_score = "f1_micro"
871
  classes = None
872
+ zero_division = 0.0
873
 
874
  @abstractmethod
875
  def get_element_group(self, element):
 
879
  def get_element_representation(self, element):
880
  pass
881
 
882
+ def group_elements(self, elements_list):
883
  return {
884
+ k: Counter(
885
+ [
886
+ self.get_element_representation(value)
887
+ for value in elements_list
888
+ if self.get_element_group(value) == k
889
+ ]
890
+ )
891
+ for k in {self.get_element_group(e) for e in elements_list}
892
  }
893
 
894
  def calculate_groups_ratio(self, actual_group, total_group):
895
+ return sum(
896
+ [min(actual_group[k], total_group[k]) for k in actual_group.keys()]
897
+ ), sum(actual_group.values())
898
+
899
+ def precision(self, pn, pd, rn, rd):
900
+ return self.zero_division if pn == 0 and pd == 0 else pn / pd
901
+
902
+ def recall(self, pn, pd, rn, rd):
903
+ return self.zero_division if rn == 0 and rd == 0 else rn / rd
904
 
905
  def f1(self, pn, pd, rn, rd):
906
+ precision = self.precision(pn, pd, rn, rd)
907
+ recall = self.recall(pn, pd, rn, rd)
908
  try:
909
  return 2 * precision * recall / (precision + recall)
910
  except ZeroDivisionError:
911
+ return self.zero_division
912
+
913
+ def compute(
914
+ self,
915
+ references: List[Any],
916
+ predictions: List[Any],
917
+ additional_inputs: List[Dict],
918
+ ) -> dict:
919
  # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]:
920
  if isinstance(references[0], list) and isinstance(references[0][0], list):
921
  references = [element[0] for element in references]
922
 
923
  assert len(references) == len(predictions), (
924
+ f"references size ({len(references)})"
925
+ f" doesn't match predictions size ({len(predictions)})."
926
  )
927
  if self.classes is None:
928
+ classes = {
929
+ self.get_element_group(e) for sublist in references for e in sublist
930
+ }
931
  else:
932
  classes = self.classes
933
+ groups_statistics = {}
934
  for references_batch, predictions_batch in zip(references, predictions):
935
  grouped_references = self.group_elements(references_batch)
936
  grouped_predictions = self.group_elements(predictions_batch)
937
+ all_groups = set(grouped_references.keys()).union(
938
+ grouped_predictions.keys()
939
+ )
940
  for group in all_groups:
941
  if group not in groups_statistics:
942
  groups_statistics[group] = {
 
958
  groups_statistics[group]["recall_numerator"] += rn
959
  groups_statistics[group]["recall_denominator"] += rd
960
 
 
961
  num_of_unknown_class_predictions = 0
962
  pn_total = pd_total = rn_total = rd_total = 0
963
+ f1_result = {}
964
+ recall_result = {}
965
+ precision_result = {}
966
  for group in groups_statistics.keys():
967
  pn, pd, rn, rd = (
968
  groups_statistics[group]["precision_numerator"],
 
970
  groups_statistics[group]["recall_numerator"],
971
  groups_statistics[group]["recall_denominator"],
972
  )
973
+ pn_total, pd_total, rn_total, rd_total = (
974
+ pn_total + pn,
975
+ pd_total + pd,
976
+ rn_total + rn,
977
+ rd_total + rd,
978
+ )
979
  if group in classes:
980
+ f1_result[f"f1_{group}"] = self.f1(pn, pd, rn, rd)
981
+ recall_result[f"recall_{group}"] = self.recall(pn, pd, rn, rd)
982
+ precision_result[f"precision_{group}"] = self.precision(pn, pd, rn, rd)
983
  else:
984
  num_of_unknown_class_predictions += pd
985
+
986
+ result = f1_result
987
  try:
988
+ result["f1_macro"] = sum(f1_result.values()) / len(result.keys())
989
+ result["recall_macro"] = sum(recall_result.values()) / len(
990
+ recall_result.keys()
991
+ )
992
+ result["precision_macro"] = sum(precision_result.values()) / len(
993
+ precision_result.keys()
994
+ )
995
  except ZeroDivisionError:
996
+ result["f1_macro"] = self.zero_division
997
+ result["recall_macro"] = self.zero_division
998
+ result["precision_macro"] = self.zero_division
999
 
1000
  amount_of_predictions = pd_total
1001
  if amount_of_predictions == 0:
1002
  result["in_classes_support"] = 1.0
1003
  else:
1004
+ result["in_classes_support"] = (
1005
+ 1.0 - num_of_unknown_class_predictions / amount_of_predictions
1006
+ )
1007
+ result["f1_micro"] = self.f1(pn_total, pd_total, rn_total, rd_total)
1008
+ result["recall_micro"] = self.recall(pn_total, pd_total, rn_total, rd_total)
1009
+ result["precision_micro"] = self.precision(
1010
+ pn_total, pd_total, rn_total, rd_total
1011
+ )
1012
  return result
1013
 
1014
 
 
1043
  reduction_map = {"mean": ["f1", "precision", "recall"]}
1044
  main_score = "f1"
1045
 
1046
+ def compute(
1047
+ self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
1048
+ ) -> dict:
1049
+ results = [
1050
+ self._compute_single_ref(reference, prediction) for reference in references
1051
+ ]
1052
+ return {
1053
+ measure: max(r[i] for r in results)
1054
+ for i, measure in enumerate(["precision", "recall", "f1"])
1055
+ }
1056
 
1057
+ def _compute_single_ref(
1058
+ self, reference: Any, prediction: Any
1059
+ ) -> Tuple[float, float, float]:
1060
  prediction_tokens = normalize_answer(prediction).split()
1061
  reference_tokens = normalize_answer(reference).split()
1062
  common = Counter(prediction_tokens) & Counter(reference_tokens)
 
1097
  self.model = SentenceTransformer(self.model_name)
1098
  self.util = sbert_util
1099
 
1100
+ def compute(
1101
+ self,
1102
+ references: List[List[Any]],
1103
+ predictions: List[Any],
1104
+ additional_inputs: List[Dict],
1105
+ ) -> List[Any]:
1106
  scores = []
1107
 
1108
  # we are in a multi-reference case (each prediction may have multiple
 
1117
 
1118
  # compute s-bert embeddings
1119
  preds_emb = self.model.encode(predictions)
1120
+ refs_emb = self.model.encode(
1121
+ [ref for ref_group in references for ref in ref_group]
1122
+ )
1123
 
1124
  # for each candidate, pick the reference with the highest score
1125
  for pred_emb, ref_group_bounds in zip(preds_emb, ref_group_boundaries):
 
1137
  model_name: str
1138
 
1139
  def prepare(self):
1140
+ super().prepare()
1141
  from transformers import pipeline
1142
 
1143
  self.pipe = pipeline("text-classification", model=self.model_name)
1144
 
1145
+ def compute(
1146
+ self,
1147
+ references: List[List[Any]],
1148
+ predictions: List[Any],
1149
+ additional_inputs: List[Dict],
1150
+ ) -> List[Any]:
1151
  # treat the references as the questions and the predictions as answers
1152
  # assume a single reference
1153
  questions = [refs[0] for refs in references]
 
1159
  # compute the metric
1160
  # add function_to_apply="none" to disable sigmoid
1161
  return self.pipe(inputs, batch_size=self.batch_size)
1162
+
1163
+
1164
+ class NDCG(GlobalMetric):
1165
+ """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
1166
+
1167
+ As this measures ranking, it is a global metric that can only be calculated over groups of instances. In the
1168
+ common use case where the instances are grouped by different queries, i.e., where the task is to provide a
1169
+ relevance score for a search result w.r.t. a query, an nDCG score is calculated per each query (specified in the
1170
+ "query" input field of an instance) and the final score is the average across all queries.
1171
+ Note that the expected scores are relevance scores (i.e., higher is better) and not rank indices. The absolute
1172
+ value of the scores is only meaningful for the reference scores; for the predictions, only the ordering of the
1173
+ scores affects the outcome - for example, predicted scores of [80, 1, 2] and [0.8, 0.5, 0.6] will receive
1174
+ the same nDCG score w.r.t. a given set of reference scores.
1175
+
1176
+ See also https://en.wikipedia.org/wiki/Discounted_cumulative_gain
1177
+ """
1178
+
1179
+ main_score = "nDCG"
1180
+
1181
+ def prepare(self):
1182
+ from sklearn.metrics import ndcg_score
1183
+
1184
+ super().prepare()
1185
+ self.eval = ndcg_score
1186
+
1187
+ def compute(
1188
+ self,
1189
+ references: List[List[Any]],
1190
+ predictions: List[Any],
1191
+ additional_inputs: List[Any],
1192
+ ) -> dict:
1193
+ from collections import defaultdict
1194
+ from statistics import mean
1195
+
1196
+ query_to_predictions_and_references = defaultdict(lambda: [[], []])
1197
+ for reference, pred, inputs_dict in zip(
1198
+ references, predictions, additional_inputs
1199
+ ):
1200
+ query = inputs_dict.get("query")
1201
+ query_to_predictions_and_references[query][0].append(pred)
1202
+ query_to_predictions_and_references[query][1].append(reference)
1203
+
1204
+ scores = []
1205
+ for q_predictions, q_references in query_to_predictions_and_references.values():
1206
+ if len(q_references) == 1:
1207
+ continue
1208
+
1209
+ if (
1210
+ None in q_predictions
1211
+ ): # model failed to predict numeric scores for some instances
1212
+ numeric_predictions = [
1213
+ pred for pred in q_predictions if pred is not None
1214
+ ]
1215
+ if len(numeric_predictions) <= 1: # no meaningful ranking
1216
+ scores.append(0)
1217
+ continue
1218
+ # consider non-numeric model predictions as ranked last
1219
+ min_value = min(numeric_predictions)
1220
+ q_predictions = [
1221
+ 1 + (pred - min_value) if pred is not None else 0
1222
+ for pred in q_predictions
1223
+ ]
1224
+ scores.append(self.eval([q_references], [q_predictions]))
1225
+ return {self.main_score: mean(scores) if len(scores) > 0 else np.nan}
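
To illustrate the confidence-interval machinery introduced above: instance-level metrics bootstrap their per-instance scores (score_based_confidence_interval), while global metrics re-run _compute on resampled inputs (compute_global_confidence_intervals). The sketch below is not part of metrics.py; it is a minimal, standalone example of the score-based interval, assuming made-up per-instance scores and a fixed seed in place of get_seed().

from statistics import mean

import numpy as np
from scipy.stats import bootstrap

# Hypothetical per-instance scores, e.g. the "accuracy" values an InstanceMetric
# stores under instance["score"]["instance"].
instance_scores = [1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0]

ci = bootstrap(
    (instance_scores,),  # scipy expects a sequence of samples
    statistic=mean,  # the same aggregation as the "mean" reduction above
    n_resamples=1000,  # _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS
    confidence_level=0.95,
    random_state=np.random.default_rng(42),  # the real code seeds this from get_seed()
).confidence_interval

print(f"score = {mean(instance_scores):.2f}, 95% CI = [{ci.low:.2f}, {ci.high:.2f}]")

For a GlobalMetric such as F1, the same bootstrap call is used, but the statistic resamples instance indices and re-computes the metric on each resample, substituting resampled successful scores when a resample fails (as explained in the in-line comments of compute_global_confidence_intervals).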