Spaces:
Sleeping
Sleeping
HalteroXHunter
committed on
Commit
·
9ddbb93
1
Parent(s):
80d7919
include chrf
Browse files
- generation_evaluator.py +13 -0
- requirements.txt +3 -1
generation_evaluator.py
CHANGED
@@ -82,6 +82,9 @@ and then employing another pre-training phrase using synthetic data. Finally it
|
|
82 |
it for your specific application (the latter is expected to perform better).
|
83 |
See the project's README at https://github.com/google-research/bleurt#readme for more information.
|
84 |
|
|
|
|
|
|
|
85 |
"""
|
86 |
|
87 |
_KWARGS_DESCRIPTION = """
|
@@ -118,6 +121,12 @@ BERT_SCORE:{
|
|
118 |
},
|
119 |
BLEURT:{
|
120 |
"scores": List of scores.
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
}
|
122 |
"""
|
123 |
|
@@ -180,6 +189,9 @@ class GenerationEvaluator(evaluate.Metric):
|
|
180 |
|
181 |
mean_bleurt_score = np.mean(bleurt_results['scores'])
|
182 |
bleurt_results['scores'] = round(mean_bleurt_score, 4)
|
|
|
|
|
|
|
183 |
|
184 |
return {
|
185 |
"ROUGE": rouge_results,
|
@@ -187,4 +199,5 @@ class GenerationEvaluator(evaluate.Metric):
|
|
187 |
"EXACT_MATCH": exact_match_results,
|
188 |
"BERT_SCORE": bert_score_results,
|
189 |
"BLEURT": bleurt_results,
|
|
|
190 |
}
|
|
|
82 |
it for your specific application (the latter is expected to perform better).
|
83 |
See the project's README at https://github.com/google-research/bleurt#readme for more information.
|
84 |
|
85 |
+
ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
|
86 |
+
and ChrF++ adds word n-grams as well which correlates more strongly with direct assessment. We use the implementation
|
87 |
+
that is already present in sacrebleu.
|
88 |
"""
|
89 |
|
90 |
_KWARGS_DESCRIPTION = """
|
|
|
121 |
},
|
122 |
BLEURT:{
|
123 |
"scores": List of scores.
|
124 |
+
},
|
125 |
+
CHRF:{
|
126 |
+
'score' (float): The chrF (chrF++) score,
|
127 |
+
'char_order' (int): The character n-gram order,
|
128 |
+
'word_order' (int): The word n-gram order. If equals to 2, the metric is referred to as chrF++,
|
129 |
+
'beta' (int): Determine the importance of recall w.r.t precision
|
130 |
}
|
131 |
"""
|
132 |
|
|
|
189 |
|
190 |
mean_bleurt_score = np.mean(bleurt_results['scores'])
|
191 |
bleurt_results['scores'] = round(mean_bleurt_score, 4)
|
192 |
+
|
193 |
+
chrf = evaluate.load("chrf")
|
194 |
+
chrf_results = chrf.compute(predictions=predictions, references=references)
|
195 |
|
196 |
return {
|
197 |
"ROUGE": rouge_results,
|
|
|
199 |
"EXACT_MATCH": exact_match_results,
|
200 |
"BERT_SCORE": bert_score_results,
|
201 |
"BLEURT": bleurt_results,
|
202 |
+
"CHRF": chrf_results
|
203 |
}
|
requirements.txt
CHANGED
@@ -4,4 +4,6 @@ scikit-learn
|
|
4 |
gradio
|
5 |
bert_score
|
6 |
git+https://github.com/google-research/bleurt.git
|
7 |
-
numpy
|
|
|
|
|
|
4 |
gradio
|
5 |
bert_score
|
6 |
git+https://github.com/google-research/bleurt.git
|
7 |
+
numpy
|
8 |
+
git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d
|
9 |
+
sacrebleu
|