shounakpaul95 committed
Commit 8390a54
Parent(s): 7980958
Update eval_utils.py
eval_utils.py CHANGED (+12 -15)
@@ -7,7 +7,6 @@ import nltk
 import numpy as np
 
 from nervaluate import Evaluator
-# from rouge_score import rouge_scorer
 from sacrebleu.metrics import BLEU, CHRF
 from sklearn.metrics import f1_score
 from tqdm import tqdm
@@ -37,7 +36,7 @@ def evaluate_bail(gold_data, pred_data):
 
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     print("Macro-F1 on HLDC-all-districts test set:", f1)
-    return {"mF1": f1}
+    return {"mF1": f1*100}
 
 def get_BLEU_score(ref_text_all, machine_text_all):
     sc_all = []
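This pattern repeats through the commit: every metric that the underlying library reports as a fraction in [0, 1] is rescaled to a percentage before being returned. A minimal sketch with made-up labels (not from the repository) illustrates the convention:

from sklearn.metrics import f1_score

gold_labels = [1, 0, 1, 1, 0]   # hypothetical bail-grant labels
pred_labels = [1, 0, 0, 1, 0]

f1 = f1_score(gold_labels, pred_labels, average="macro")  # 0.8 on this toy data
print({"mF1": f1 * 100})  # leaderboard-style percentage, as in the new return value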
@@ -90,7 +89,7 @@ def evaluate_cjpe(gold_data, pred_data):
     }
     print("Explanability for ILDC Expert:", explanation_result)
     #return {**prediction_result, **explanation_result}
-    return {"mF1": f1, "ROUGE-L": rouge_score, "BLEU": bleu_score}
+    return {"mF1": f1*100, "ROUGE-L": rouge_score*100, "BLEU": bleu_score*100}
 
 def span2bio(txt, roles):
     roles = sorted(roles, key = lambda x:x['start'])
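span2bio converts character-span role annotations into token-level BIO tags before NER-style scoring; only its first line is visible in this diff. The following is a hypothetical reconstruction of the idea, not the repository's implementation (token offsets and field names are assumptions):

def span_to_bio(tokens_with_offsets, roles):
    """tokens_with_offsets: list of (token, start, end); roles: [{'start', 'end', 'label'}]."""
    roles = sorted(roles, key=lambda x: x['start'])
    tags = []
    for tok, start, end in tokens_with_offsets:
        tag = "O"
        for r in roles:
            if start >= r['start'] and end <= r['end']:
                # token at the span start gets B-, later tokens inside get I-
                tag = ("B-" if start == r['start'] else "I-") + r['label']
                break
        tags.append(tag)
    return tags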
@@ -162,7 +161,7 @@ def evaluate_lner(gold_data, pred_data, text_data):
         results_per_fold[f"fold_{fold}"] = avg_f1
 
     print("Strict macro-F1 on L-NER Dataset:", results_per_fold)
-    return {"strict mF1": sum(results_per_fold.values())/len(results_per_fold)}
+    return {"strict mF1": sum(results_per_fold.values())/len(results_per_fold)*100}
 
 
 def evaluate_rr(gold_data, pred_data):
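The per-fold strict F1 averaged above comes from nervaluate, imported at the top of the file. A hedged sketch of a strict span-level F1 over BIO sequences like those span2bio produces (the tag name is invented; the result schema assumes nervaluate's documented strict/exact/partial/ent_type breakdown):

from nervaluate import Evaluator

gold_bio = [["B-JUDGE", "I-JUDGE", "O"]]   # one document, three tokens
pred_bio = [["B-JUDGE", "I-JUDGE", "O"]]
evaluator = Evaluator(gold_bio, pred_bio, tags=["JUDGE"], loader="list")
results = evaluator.evaluate()[0]          # overall results (per-tag results follow)
print(results["strict"]["f1"])             # exact boundaries and exact type -> 1.0 here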
@@ -188,7 +187,7 @@ def evaluate_rr(gold_data, pred_data):
 
     f1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
     print(f"Macro-F1 on combined test set:", f1)
-    return {"mF1": f1}
+    return {"mF1": f1*100}
 
 
 def evaluate_lsi(gold_data, pred_data):
@@ -211,7 +210,7 @@ def evaluate_lsi(gold_data, pred_data):
 
     f1 = f1_score(gold_matrix, pred_matrix, average="macro")
     print("Macro-F1 on ILSI test set:", f1)
-    return {"mF1": f1}
+    return {"mF1": f1*100}
 
 
 def evaluate_pcr(gold_data, pred_data):
@@ -241,7 +240,7 @@ def evaluate_pcr(gold_data, pred_data):
 
     max_f1 = max(f1_scores)
     index_max = f1_scores.index(max_f1) + 1
-    return {"muF1@K": f"{max_f1:.2f}@{index_max}"}
+    return {"muF1@K": f"{max_f1*100:.2f}@{index_max}"}
 
 
 def evaluate_summ(gold_data, pred_data):
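evaluate_pcr reports its micro-F1 in a "score@K" format: judging from the lines above, f1_scores holds one micro-F1 per candidate-list cutoff K, and the commit rescales the best of them to a percentage. A toy rendering of that convention (scores are invented):

f1_scores = [0.41, 0.48, 0.45]                      # hypothetical micro-F1 at K = 1, 2, 3
max_f1 = max(f1_scores)
index_max = f1_scores.index(max_f1) + 1             # K is 1-indexed
print({"muF1@K": f"{max_f1*100:.2f}@{index_max}"})  # {'muF1@K': '48.00@2'}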
@@ -257,15 +256,13 @@ def evaluate_summ(gold_data, pred_data):
         pred_summaries.append(pred_summary)
 
 
-
-
-
+    rl_evaluator = rouge.Rouge(metrics=['rouge-n','rouge-l'], max_n=2, limit_length=False, apply_avg=True)
+    rl_scores = rl_evaluator.get_scores(pred_summaries, gold_summaries)
+    print("Rouge:", {k:v['f'] for k,v in rl_scores.items()}, flush=True)
 
     _, _, bs = bert_score.score(pred_summaries, gold_summaries, lang="en", verbose=True)
     print("BERTSCORE:", bs.mean().item())
-
-    return {'ROUGE-L': '-', 'BERTSCORE': bs.mean().item()}
-
+    return {'ROUGE-L': rl_scores['rouge-l']['f'] * 100, 'BERTSCORE': bs.mean().item() * 100}
 
 def evaluate_lmt(gold_data, pred_data):
     tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=False)
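The substantive change of the commit is in evaluate_summ: ROUGE-L, previously stubbed out as '-', is now computed with py-rouge and returned alongside BERTScore, both as percentages. The added calls can be exercised in isolation (the two example summaries are made up):

import rouge
import bert_score

pred_summaries = ["the court granted bail"]
gold_summaries = ["the court granted bail to the accused"]

rl_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2,
                           limit_length=False, apply_avg=True)
rl_scores = rl_evaluator.get_scores(pred_summaries, gold_summaries)
print(rl_scores['rouge-l']['f'] * 100)     # ROUGE-L F1 as a percentage

_, _, bs = bert_score.score(pred_summaries, gold_summaries, lang="en")
print(bs.mean().item() * 100)              # BERTScore F1 as a percentage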
@@ -308,14 +305,14 @@ def evaluate_lmt(gold_data, pred_data):
 
     return {
         "BLEU": sum(bleu_scores) / len(bleu_scores),
-        "GLEU": sum(gleu_scores) / len(gleu_scores),
+        "GLEU": sum(gleu_scores) / len(gleu_scores) * 100,
         "chrF++": sum(chrfpp_scores) / len(chrfpp_scores),
     }
 
 
 def create_output_json(evaluation_results):
     output = {
-        "Method": "Dummy Ideal Only Summ",
+        "Method": "Dummy Ideal Only Summ 2",
         "Submitted By": "IL-TUR",
         "Github Link": "dummy submission",
         "L-NER": {"strict mF1": evaluation_results["lner"]["strict mF1"]},
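Only GLEU is rescaled in evaluate_lmt's return value. A plausible reading, assuming the scores come from the libraries imported in this file: sacrebleu's BLEU and chrF++ already report on a 0-100 scale, while NLTK's sentence-level GLEU is a fraction in [0, 1]:

from sacrebleu.metrics import BLEU, CHRF
from nltk.translate.gleu_score import sentence_gleu

hyp = "the court granted bail"
ref = "the court granted bail to the accused"

print(BLEU(effective_order=True).sentence_score(hyp, [ref]).score)  # already 0-100
print(CHRF(word_order=2).sentence_score(hyp, [ref]).score)          # chrF++, 0-100
print(sentence_gleu([ref.split()], hyp.split()) * 100)              # 0-1, hence *100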