Elron committed
Commit 0badbfa
1 Parent(s): 08d3a7d

Upload metrics.py with huggingface_hub

Files changed (1)
  1. metrics.py +229 -31
metrics.py CHANGED
@@ -1,13 +1,15 @@
+import re
+import string
 import uuid
 from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
-from typing import Any, Dict, Generator, List, Optional
+from typing import Any, Dict, Generator, List, Optional, Tuple

 import evaluate
 import numpy

-from .dataclass import InternalField
+from .dataclass import InternalField, OptionalField
 from .operator import (
     MultiStreamOperator,
     SingleStreamOperator,
@@ -91,6 +93,61 @@ class GlobalMetric(SingleStreamOperator, Metric):
     pass


+class BulkInstanceMetric(SingleStreamOperator, Metric):
+    main_score: str
+    reduction_map: Dict[str, List[str]]
+
+    implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
+
+    def process(self, stream: Stream, stream_name: str = None) -> Generator:
+        global_score = {}
+        instances = []
+
+        # consume the stream
+        references, predictions = map(
+            list, zip(*[(instance["references"], instance["prediction"]) for instance in stream])
+        )
+
+        # compute the metric over all refs and preds
+        instance_scores = self.compute(references=references, predictions=predictions)
+
+        # add the score and score_name fields
+        for instance_score in instance_scores:
+            instance_score["score"] = instance_score[self.main_score]
+            instance_score["score_name"] = self.main_score
+
+        for instance, score in zip(stream, instance_scores):
+            if "score" not in instance:
+                instance["score"] = {"global": global_score, "instance": {}}
+            else:
+                global_score = instance["score"]["global"]
+
+            instance["score"]["instance"].update(score)
+
+            instances.append(instance)
+
+        for reduction, fields in self.reduction_map.items():
+            assert (
+                reduction in self.implemented_reductions
+            ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
+
+            if reduction == "mean":
+                from statistics import mean
+
+                for field in fields:
+                    global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
+                    if field == self.main_score:
+                        global_score["score"] = global_score[field]
+                        global_score["score_name"] = self.main_score
+
+        for instance in instances:
+            yield instance
+
+    @abstractmethod
+    def compute(self, references: List[List[Any]], predictions: List[Any]) -> Dict[str, Any]:
+        pass
+
+
 class InstanceMetric(SingleStreamOperator, Metric):
     implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])

@@ -134,8 +191,8 @@ class InstanceMetric(SingleStreamOperator, Metric):
         for instance in instances:
             yield instance

-    def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
-        result = self.compute(references=references, predictions=predictions)
+    def _compute(self, references: List[str], prediction: str) -> dict:
+        result = self.compute(references=references, prediction=prediction)
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         return result
@@ -217,24 +274,62 @@ class MetricPipeline(MultiStreamOperator, Metric):


 class HuggingfaceMetric(GlobalMetric):
-    metric_name: str = None
-    main_score: str = None
-    scale: float = 1.0
-    hf_compute_args: dict = {}
+    hf_metric_name: str = None
+    main_score: str = None  # The main score returned from the metric
+    hf_main_score: str = None  # Used if HF uses a different score name for the main metric
+
+    scale: float = 1.0  # optional scaling of main results
+    scaled_fields: list = None
+    hf_compute_args: Dict[str, Any] = OptionalField(default_factory=dict)
+    experiment_id: str = OptionalField(default_factory=lambda: str(uuid.uuid4()))

     def prepare(self):
         super().prepare()
-        self.metric = evaluate.load(self.metric_name)
+        self.metric = evaluate.load(self.hf_metric_name, experiment_id=self.experiment_id)

     def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
         result = self.metric.compute(predictions=predictions, references=references, **self.hf_compute_args)
+        if self.hf_main_score:
+            result[self.main_score] = result[self.hf_main_score]
+            del result[self.hf_main_score]
         if self.scale != 1.0:
-            for key in result:
-                if isinstance(result[key], float):
+            assert self.scaled_fields is not None, f"Scaling factor was set to {self.scale}, but no fields specified"
+            for key in self.scaled_fields:
+                assert key in result, f"Trying to scale field '{key}' which is not in results of metrics: {result}"
+                if isinstance(result[key], list):
+                    assert all(
+                        isinstance(v, float) for v in result[key]
+                    ), f"Not all scaled field '{key}' values are floats: {result[key]}"
+                    result[key] = [v / self.scale for v in result[key]]
+                else:
+                    assert isinstance(result[key], float), f"Scaled field '{key}' is not float: {result[key]}"
                     result[key] /= self.scale
         return result


+class HuggingfaceBulkMetric(BulkInstanceMetric):
+    hf_metric_name: str
+
+    hf_metric_fields: List[str]
+    hf_compute_args: dict = {}
+
+    def prepare(self):
+        super().prepare()
+        self.metric = evaluate.load(self.hf_metric_name)
+
+    def compute(self, references: List[List[str]], predictions: List[str]) -> List[Dict[str, Any]]:
+        scores = self.metric.compute(predictions=predictions, references=references, **self.hf_compute_args)
+
+        # convert dict of lists to a list of dicts
+        results = [{} for _ in range(len(scores[self.hf_metric_fields[0]]))]
+        for key in self.hf_metric_fields:
+            values = scores[key]
+            for result_id, result in enumerate(results):
+                result[key] = values[result_id]
+
+        return results
+
+
 class F1(GlobalMetric):
     _metric = None
     main_score = "f1_macro"
@@ -370,7 +465,7 @@ class F1MacroMultiLabel(F1MultiLabel):


 class Rouge(HuggingfaceMetric):
-    metric_name = "rouge"
+    hf_metric_name = "rouge"
     main_score = "rougeL"
     scale = 1.0

@@ -380,7 +475,7 @@ class Rouge(HuggingfaceMetric):
     sent_split_newline: bool = True

     def prepare(self):
-        self.hf_compute_args = {"use_aggregator": self.use_aggregator, "rouge_types": self.rouge_types}
+        self.hf_compute_args.update({"use_aggregator": self.use_aggregator, "rouge_types": self.rouge_types})

         super().prepare()
         import nltk
@@ -416,13 +511,9 @@ class CharEditDistanceAccuracy(SingleReferenceInstanceMetric):


 class Wer(HuggingfaceMetric):
-    metric_name = "wer"
+    hf_metric_name = "wer"
     main_score = "wer"

-    def prepare(self):
-        super().prepare()
-        self.metric = evaluate.load(self.metric_name)
-
     def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
         assert all(
             len(reference) == 1 for reference in references
@@ -432,20 +523,8 @@ class Wer(HuggingfaceMetric):
         return {self.main_score: result}


-class Bleu(HuggingfaceMetric):
-    metric_name = "bleu"
-    main_score = "bleu"
-    scale = 1.0
-
-
-class SacreBleu(HuggingfaceMetric):
-    metric_name = "sacrebleu"
-    main_score = "score"
-    scale = 1.0
-
-
 class MatthewsCorrelation(HuggingfaceMetric):
-    metric_name = "matthews_correlation"
+    hf_metric_name = "matthews_correlation"
     main_score = "matthews_correlation"
     str_to_id: dict = InternalField(default_factory=dict)

@@ -564,3 +643,122 @@ class NER(CustomF1):

     def get_element_representation(self, element):
         return str(element)
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        return re.sub(r"\b(a|an|the)\b", " ", text)
+
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+class TokenOverlap(InstanceMetric):
+    reduction_map = {"mean": ["f1", "precision", "recall"]}
+    main_score = "f1"
+
+    def compute(self, references: List[Any], prediction: Any) -> dict:
+        results = [self._compute_single_ref(reference, prediction) for reference in references]
+        return {measure: max(r[i] for r in results) for i, measure in enumerate(["precision", "recall", "f1"])}
+
+    def _compute_single_ref(self, reference: Any, prediction: Any) -> Tuple[float, float, float]:
+        prediction_tokens = normalize_answer(prediction).split()
+        reference_tokens = normalize_answer(reference).split()
+        common = Counter(prediction_tokens) & Counter(reference_tokens)
+        num_same = sum(common.values())
+        if num_same == 0:
+            pr, rc, f1 = 0, 0, 0
+        else:
+            pr = 1.0 * num_same / len(prediction_tokens)
+            rc = 1.0 * num_same / len(reference_tokens)
+            f1 = (2 * pr * rc) / (pr + rc)
+        return pr, rc, f1
+
+
+class BertScore(HuggingfaceBulkMetric):
+    hf_metric_name = "bertscore"
+    main_score = "f1"
+    reduction_map = {"mean": ["f1", "precision", "recall"]}
+    hf_metric_fields = ["f1", "precision", "recall"]
+    model_name: str
+
+    def prepare(self):
+        super().prepare()
+        self.hf_compute_args = {"model_type": self.model_name}
+
+
+class SentenceBert(BulkInstanceMetric):
+    reduction_map = {"mean": ["score"]}
+    main_score = "score"
+    batch_size: int = 32
+
+    model_name: str
+
+    def prepare(self):
+        super().prepare()
+        from sentence_transformers import SentenceTransformer
+        from sentence_transformers import util as sbert_util
+
+        self.model = SentenceTransformer(self.model_name)
+        self.util = sbert_util
+
+    def compute(self, references: List[List[Any]], predictions: List[Any]) -> List[Any]:
+        scores = []
+
+        # we are in a multi-reference case (each prediction may have multiple
+        # references), so we need to flatten the refs in order to compute the
+        # embeddings in one batch, but first we have to store the spans of
+        # reference groups, so we can recover them later on.
+        ref_group_boundaries = []
+        count = 0
+        for ref_group in references:
+            ref_group_boundaries.append((count, count + len(ref_group)))
+            count += len(ref_group)
+
+        # compute s-bert embeddings
+        preds_emb = self.model.encode(predictions)
+        refs_emb = self.model.encode([ref for ref_group in references for ref in ref_group])
+
+        # for each candidate, pick the reference with the highest score
+        for pred_emb, ref_group_bounds in zip(preds_emb, ref_group_boundaries):
+            refs_group_emb = refs_emb[ref_group_bounds[0] : ref_group_bounds[1]]
+            scores.append(self.util.cos_sim(pred_emb, refs_group_emb).max().item())
+
+        return [{"score": score} for score in scores]
+
+
+class Reward(BulkInstanceMetric):
+    reduction_map = {"mean": ["score"]}
+    main_score = "score"
+    batch_size: int = 32
+
+    model_name: str
+
+    def prepare(self):
+        from transformers import pipeline
+
+        self.pipe = pipeline("text-classification", model=self.model_name)
+
+    def compute(self, references: List[List[Any]], predictions: List[Any]) -> List[Any]:
+        # treat the references as the questions and the predictions as answers
+        # assume a single reference
+        questions = [refs[0] for refs in references]
+        answers = predictions
+
+        # prepare for computation
+        inputs = [{"text": q, "text_pair": a} for q, a in zip(questions, answers)]
+
+        # compute the metric
+        # add function_to_apply="none" to disable sigmoid
+        return self.pipe(inputs, batch_size=self.batch_size)
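
A minimal usage sketch of the newly added pieces (illustrative only, not part of the diff above; it assumes the module is importable under a flat name and that TokenOverlap can be instantiated and its compute() called directly, outside the stream operators that normally drive these metrics):

    # Sketch: exercise the new normalize_answer helper and TokenOverlap metric directly.
    # The flat import path is an assumption made for brevity; in the library the module
    # lives inside a package and is used via the stream-processing operators.
    from metrics import TokenOverlap, normalize_answer

    print(normalize_answer("The cat sat on the mat!"))  # -> "cat sat on mat"

    metric = TokenOverlap()
    scores = metric.compute(
        references=["The cat sat on the mat"],
        prediction="a cat sat on a mat",
    )
    # Both strings normalize to the same tokens, so precision, recall and f1 should all be 1.0.
    print(scores)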