pminervini committed
Commit 9563130
1 Parent(s): 73d1e6e
src/backend/tasks/xsum/task.py CHANGED
@@ -3,7 +3,51 @@ from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_task
 from lm_eval.api.metrics import mean
 
-from src.backend.tasks.xsum import utils
+import sacrebleu
+from rouge_score import rouge_scorer, scoring
+
+
+def bleu(refs, preds):
+    """
+    Returns `t5` style BLEU scores. See the related implementation:
+    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
+
+    :param refs:
+        A `list` of `list` of reference `str`s.
+    :param preds:
+        A `list` of predicted `str`s.
+    """
+    score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False,
+                                  lowercase=False, tokenize="intl", use_effective_order=False).score
+    return score
+
+
+def rouge(refs, preds):
+    """
+    Returns `t5` style ROUGE scores. See the related implementation:
+    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
+
+    :param refs:
+        A `list` of reference `strs`.
+    :param preds:
+        A `list` of predicted `strs`.
+    """
+    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
+    scorer = rouge_scorer.RougeScorer(rouge_types)
+    # Add newlines between sentences to correctly compute `rougeLsum`.
+
+    def _prepare_summary(summary):
+        summary = summary.replace(" . ", ".\n")
+        return summary
+
+    # Accumulate confidence intervals.
+    aggregator = scoring.BootstrapAggregator()
+    for ref, pred in zip(refs, preds):
+        ref = _prepare_summary(ref)
+        pred = _prepare_summary(pred)
+        aggregator.add_scores(scorer.score(ref, pred))
+    result = aggregator.aggregate()
+    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 
 
 @register_task("xsum")
@@ -14,7 +58,14 @@ class XSum(Task):
 
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
-        print('XXX XSum!')
+        self.factkb_tokenizer = None
+        self.factkb_model = None
+
+    def init_factkb(self):
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+        self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
+        self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2)
 
     def has_training_docs(self):
         return True
@@ -70,7 +121,31 @@ class XSum(Task):
         ]
 
     def process_results(self, doc, results):
-        return utils.process_results(doc, results)
+        completion = results[0]
+
+        # document = doc["document"]
+        true_refs = [doc["summary"]]
+        all_refs = true_refs
+
+        # ROUGE-N
+        rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
+
+        # ROUGE-1
+        rouge1_scores = [score["rouge1"] for score in rouge_scores]
+
+        # ROUGE-2
+        rouge2_scores = [score["rouge2"] for score in rouge_scores]
+
+        # ROUGE-L
+        rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
+
+        res = {
+            "rouge1": rouge1_scores[0],
+            "rouge2": rouge2_scores[0],
+            "rougeL": rougeL_scores[0],
+        }
+
+        return res
 
     def aggregation(self):
         """
@@ -87,4 +162,3 @@ class XSum(Task):
         whether a higher value of the submetric is better
         """
         return {k: True for k in ["rouge1", "rouge2", "rougeL"]}
-
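Note: the new init_factkb() method loads the FactKB factual-consistency classifier (bunsenfeng/FactKB, with the roberta-base tokenizer), but no call site appears in these hunks. The sketch below shows one way the loaded model could be applied to a generated summary. It is a minimal illustration, assuming FactKB is used as a binary sequence-pair classifier over (summary, document) pairs; the helper name factkb_score, the input order, and the positive-label index 1 are assumptions, not part of this commit.

import torch

def factkb_score(task, summary: str, document: str) -> float:
    # Hypothetical helper, not part of the commit: returns the probability that
    # `summary` is factually consistent with `document` according to the FactKB
    # classifier loaded by XSum.init_factkb(). The (summary, document) input
    # order and the positive-label index 1 are assumptions.
    if task.factkb_model is None:
        task.init_factkb()
    inputs = task.factkb_tokenizer(summary, document, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = task.factkb_model(**inputs).logits
    return torch.softmax(logits, dim=-1)[0, 1].item()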
 
src/backend/tasks/xsum/utils.py DELETED
@@ -1,89 +0,0 @@
-import sacrebleu
-import numpy as np
-
-from rouge_score import rouge_scorer, scoring
-
-
-def process_results(doc, results):
-    # (Pdb) doc.keys()
-    # dict_keys(['document', 'summary', 'id'])
-    # (Pdb++) results
-    # [' The Welsh Government has announced
-
-    # breakpoint()
-
-    completion = results[0]
-    # true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
-    # all_refs = true_refs + false_refs
-
-    document = doc["document"]
-    true_refs = [doc["summary"]]
-    all_refs = true_refs
-
-    # ROUGE-N
-    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
-    # ROUGE-1
-    rouge1_scores = [score["rouge1"] for score in rouge_scores]
-    # ROUGE-2
-    rouge2_scores = [score["rouge2"] for score in rouge_scores]
-    # ROUGE-L
-    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
-
-    res = {
-        "rouge1": rouge1_scores[0],
-        "rouge2": rouge2_scores[0],
-        "rougeL": rougeL_scores[0],
-    }
-
-    return res
-
-
-def bleu(refs, preds):
-    """
-    Returns `t5` style BLEU scores. See the related implementation:
-    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
-
-    :param refs:
-        A `list` of `list` of reference `str`s.
-    :param preds:
-        A `list` of predicted `str`s.
-    """
-    score = sacrebleu.corpus_bleu(
-        preds,
-        refs,
-        smooth_method="exp",
-        smooth_value=0.0,
-        force=False,
-        lowercase=False,
-        tokenize="intl",
-        use_effective_order=False,
-    ).score
-    return score
-
-
-def rouge(refs, preds):
-    """
-    Returns `t5` style ROUGE scores. See the related implementation:
-    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
-
-    :param refs:
-        A `list` of reference `strs`.
-    :param preds:
-        A `list` of predicted `strs`.
-    """
-    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
-    scorer = rouge_scorer.RougeScorer(rouge_types)
-    # Add newlines between sentences to correctly compute `rougeLsum`.
-
-    def _prepare_summary(summary):
-        summary = summary.replace(" . ", ".\n")
-        return summary
-
-    # Accumulate confidence intervals.
-    aggregator = scoring.BootstrapAggregator()
-    for ref, pred in zip(refs, preds):
-        ref = _prepare_summary(ref)
-        pred = _prepare_summary(pred)
-        aggregator.add_scores(scorer.score(ref, pred))
-    result = aggregator.aggregate()
-    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
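The deleted bleu() and rouge() helpers above were moved into task.py (the unused numpy import and the Pdb debug comments were dropped), so per-document scoring works the same way as before. A minimal usage sketch, assuming the repository module is importable in the evaluation environment; the reference and prediction strings are made up for illustration:

from src.backend.tasks.xsum.task import rouge

ref = "The Welsh Government has announced extra funding for schools ."      # made-up reference
pred = "Extra funding for schools was announced by the Welsh Government ."  # made-up prediction

# rouge() returns F-measures (scaled by 100) for rouge1, rouge2 and rougeLsum.
scores = rouge([ref], [pred])

# process_results reports rougeLsum under the "rougeL" key, one dict per document;
# the harness then aggregates the per-document values (task.py imports mean from
# lm_eval.api.metrics, presumably for this purpose).
doc_metrics = {"rouge1": scores["rouge1"], "rouge2": scores["rouge2"], "rougeL": scores["rougeLsum"]}
print(doc_metrics)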