HalteroXHunter committed
Commit c418edf · 1 Parent(s): 553023f

add bleurt and bertscore

Files changed (1)
  1. generation_evaluator.py +57 -4
generation_evaluator.py CHANGED
@@ -32,6 +32,21 @@ _CITATION = """\
  publisher = "COLING",
  url = "https://www.aclweb.org/anthology/C04-1072",
  pages = "501--507",
+ \
+ @inproceedings{bert-score,
+     title={BERTScore: Evaluating Text Generation with BERT},
+     author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
+     booktitle={International Conference on Learning Representations},
+     year={2020},
+     url={https://openreview.net/forum?id=SkeHuCVFDr}
+ \
+ @inproceedings{bleurt,
+     title={BLEURT: Learning Robust Metrics for Text Generation},
+     author={Thibault Sellam and Dipanjan Das and Ankur P. Parikh},
+     booktitle={ACL},
+     year={2020},
+     url={https://arxiv.org/abs/2004.04696}
+ }
"""

_DESCRIPTION = """\

@@ -54,6 +69,18 @@ Neither intelligibility nor grammatical correctness are not taken into account.

EXACT MATCH: Returns the rate at which the input predicted strings exactly match their references, ignoring any strings input as part of the regexes_to_ignore list.

+ BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference
+ sentences by cosine similarity.
+ It has been shown to correlate with human judgment on sentence-level and system-level evaluation.
+ Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different language
+ generation tasks.
+ See the project's README at https://github.com/Tiiiger/bert_score#readme for more information.
+
+ BLEURT is a learnt evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning, starting from a pretrained BERT model (Devlin et al. 2018)
+ and then employing another pre-training phase using synthetic data. Finally, it is trained on WMT human annotations. You may run BLEURT out-of-the-box or fine-tune
+ it for your specific application (the latter is expected to perform better).
+ See the project's README at https://github.com/google-research/bleurt#readme for more information.
+
"""

_KWARGS_DESCRIPTION = """
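The two descriptions above correspond to the `bertscore` and `bleurt` metrics on the Hugging Face Hub, which this commit wires into the evaluator. A minimal standalone sketch of computing them directly is shown below; the metric ids, example strings, and the English `lang` setting are illustrative assumptions rather than part of this commit, and BLEURT additionally needs the `bleurt` package installed from the google-research repository.

import evaluate

# Hypothetical example inputs for illustration only.
predictions = ["the cat sat on the mat"]
references = ["a cat was sitting on the mat"]

# "bertscore" and "bleurt" are the metric ids on the Hugging Face Hub.
bertscore = evaluate.load("bertscore")
bertscore_results = bertscore.compute(
    predictions=predictions, references=references, lang="en"  # lang selects the default English model
)

bleurt = evaluate.load("bleurt", module_type="metric")
bleurt_results = bleurt.compute(predictions=predictions, references=references)

print(bertscore_results["f1"])   # one F1 value per prediction
print(bleurt_results["scores"])  # one BLEURT score per prediction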
 
@@ -63,7 +90,7 @@ Args:
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
-
+
Returns:
    ROUGE:{
        rouge1: rouge_1 (precision, recall, f1),

@@ -81,9 +108,19 @@ BLEU:{
    },
    EXACT_MATCH:{
        "exact_match": exact_match rate. Possible values are between 0.0 and 1.0, inclusive.
+     },
+     BERT_SCORE:{
+         "precision": Precision.
+         "recall": Recall.
+         "f1": F1 score.
+         "hashcode": Hashcode of the library.
+     },
+     BLEURT:{
+         "scores": List of scores.
    }
"""

+
class GenerationEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
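For orientation, the Returns block documented above corresponds to a nested dictionary roughly shaped like the sketch below; the numbers are invented placeholders, not real metric output.

# Illustrative shape only; the ROUGE/BLEU sub-keys come from the upstream metrics
# summarized in the docstring above.
example_result = {
    "ROUGE": ...,        # rouge1 / rouge2 / rougeL entries
    "BLEU": ...,         # bleu score plus related statistics
    "EXACT_MATCH": {"exact_match": 0.5},
    "BERT_SCORE": {
        "precision": [0.93],  # one value per prediction
        "recall": [0.91],
        "f1": [0.92],
        "hashcode": "...",    # identifies the bert_score model/version used
    },
    "BLEURT": {"scores": [-0.25]},  # one value per prediction
}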
 
@@ -116,11 +153,27 @@ class GenerationEvaluator(evaluate.Metric):
        bleu_results = bleu_score.compute(
            predictions=predictions, references=references
        )
-
+
        exact_match_score = evaluate.load("exact_match")
        exact_match_results = exact_match_score.compute(
            predictions=predictions, references=references
        )

-
-         return {"ROUGE": rouge_results, "BLEU": bleu_results, "EXACT_MATCH": exact_match_results}
+         bert_score = evaluate.load("bert_score")
+         bert_score_results = bert_score.compute(
+             predictions=predictions, references=references,
+             lang="en"
+         )
+
+         bleurt_score = evaluate.load("bleurt", module_type="metric")
+         bleurt_results = bleurt_score.compute(
+             predictions=predictions, references=references
+         )
+
+         return {
+             "ROUGE": rouge_results,
+             "BLEU": bleu_results,
+             "EXACT_MATCH": exact_match_results,
+             "BERT_SCORE": bert_score_results,
+             "BLEURT": bleurt_results
+         }
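Taken together, a single compute call now returns all five metric groups. A usage sketch for the updated evaluator follows; the Hub id `HalteroXHunter/generation_evaluator` is an assumption based on the author and file names, and a local checkout can be loaded by directory path instead.

import evaluate

# Assumed Hub id; replace with the actual path of this evaluator module.
generation_evaluator = evaluate.load("HalteroXHunter/generation_evaluator")

results = generation_evaluator.compute(
    predictions=["hello there general kenobi"],
    references=["hello there general kenobi"],
)

print(results["ROUGE"])                       # rouge1 / rouge2 / rougeL entries
print(results["BLEU"])                        # bleu plus n-gram precisions
print(results["EXACT_MATCH"]["exact_match"])  # 1.0 for an exact match
print(results["BERT_SCORE"]["f1"])            # list, one value per prediction
print(results["BLEURT"]["scores"])            # list, one value per prediction

Note that the underlying metrics are loaded with evaluate.load inside the compute path, so repeated calls pay the BERT and BLEURT model-loading cost each time; caching the loaded metrics on the class would be a possible follow-up.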