HalteroXHunter committed
Commit 9ddbb93 · Parent: 80d7919

include chrf

Files changed (2)
  1. generation_evaluator.py +13 -0
  2. requirements.txt +3 -1
generation_evaluator.py CHANGED
@@ -82,6 +82,9 @@ and then employing another pre-training phrase using synthetic data. Finally it
 it for your specific application (the latter is expected to perform better).
 See the project's README at https://github.com/google-research/bleurt#readme for more information.
 
+ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
+and ChrF++ adds word n-grams as well, which correlates more strongly with direct assessment. We use the
+implementation that is already present in sacrebleu.
 """
 
 _KWARGS_DESCRIPTION = """
@@ -118,6 +121,12 @@ BERT_SCORE:{
 },
 BLEURT:{
     "scores": List of scores.
+},
+CHRF:{
+    'score' (float): The chrF (chrF++) score,
+    'char_order' (int): The character n-gram order,
+    'word_order' (int): The word n-gram order. If equal to 2, the metric is referred to as chrF++,
+    'beta' (int): Determines the importance of recall w.r.t. precision
 }
 """
 
@@ -180,6 +189,9 @@ class GenerationEvaluator(evaluate.Metric):
 
         mean_bleurt_score = np.mean(bleurt_results['scores'])
         bleurt_results['scores'] = round(mean_bleurt_score, 4)
+
+        chrf = evaluate.load("chrf")
+        chrf_results = chrf.compute(predictions=predictions, references=references)
 
         return {
             "ROUGE": rouge_results,
@@ -187,4 +199,5 @@ class GenerationEvaluator(evaluate.Metric):
             "EXACT_MATCH": exact_match_results,
             "BERT_SCORE": bert_score_results,
             "BLEURT": bleurt_results,
+            "CHRF": chrf_results
         }
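
For context, a minimal sketch of what the two added lines do when run on their own, assuming the evaluate and sacrebleu packages from requirements.txt are installed; the example sentences are made up for illustration and are not part of this commit:

    import evaluate

    # Load the sacrebleu-backed chrF metric, as GenerationEvaluator now does.
    chrf = evaluate.load("chrf")

    # Hypothetical predictions/references, for illustration only.
    predictions = ["the cat sat on the mat"]
    references = [["the cat is sitting on the mat"]]

    # The default call computes chrF; the result dict should expose the keys listed
    # in _KWARGS_DESCRIPTION: 'score', 'char_order', 'word_order', 'beta'.
    print(chrf.compute(predictions=predictions, references=references))

    # With word_order=2 the same metric is reported as chrF++ (word n-grams added
    # on top of character n-grams).
    print(chrf.compute(predictions=predictions, references=references, word_order=2))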
requirements.txt CHANGED
@@ -4,4 +4,6 @@ scikit-learn
 gradio
 bert_score
 git+https://github.com/google-research/bleurt.git
-numpy
+numpy
+git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d
+sacrebleu
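
As a quick sanity check after installing the updated requirements (a minimal sketch, not part of the commit itself), both new dependencies should import, and loading the chrf metric should only succeed once sacrebleu is available:

    import evaluate
    import sacrebleu

    # The pinned evaluate revision and sacrebleu come from the updated requirements.txt.
    print("evaluate:", evaluate.__version__)
    print("sacrebleu:", sacrebleu.__version__)

    # Loading "chrf" relies on the sacrebleu implementation mentioned in the docstring.
    evaluate.load("chrf")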