import csv
import glob
import pickle
import re

from dotenv import load_dotenv

load_dotenv()

import numpy
from bert_score import BERTScorer
from evaluation import f1_score

import evaluate
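
# Metric backends: ROUGE via the HF `evaluate` library, and BERTScore with a
# DeBERTa-xlarge-MNLI backbone on GPU.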
rouge = evaluate.load('rouge')
bertscore = BERTScorer(model_type='microsoft/deberta-xlarge-mnli', device='cuda')
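
# Evaluation settings: checkpoint folder to score, optional METEOR scoring, and
# optional cleanup of the generated text before scoring.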
_main_path = 'public_ckpt'

ADD_METEOR = True
if ADD_METEOR:
    meteor_scorer = evaluate.load('meteor')

DO_PRED_CLEAN = True


def evaluate_folder(main_path, skip_exists=True):
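    """Score every */evaluation_result*.pkl under `main_path` and write the
    aggregated metrics to results.txt and results.csv in that folder."""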
    results_path = f'{main_path}/results.txt'
    results_csv_path = f'{main_path}/results.csv'
    paths = glob.glob(f'{main_path}/*/evaluation_result*.pkl')
    all_results = []
    csv_results = []
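    # Header row of the CSV report; the 'ppl' column is left blank because
    # perplexity is not computed in this script.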
    csv_results.append(['path',
                        'ppl',
                        'F1',
                        'bleu',
                        'bleu-1',
                        'bleu-2',
                        'bleu-3',
                        'bleu-4',
                        'rouge1',
                        'rouge2',
                        'rougel',
                        'BERT f1',
                        'BERT precision',
                        'BERT recall',
                        'dist-1',
                        'dist-2',
                        'meteor',
                        'valid_num'])
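
    # Each result pickle holds the generated and reference texts for one checkpoint;
    # previously computed scores are reused when skip_exists is set.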
    for path in paths:
        with open(path, 'rb') as file:
            results = pickle.load(file)
        if results.get('result_str') is not None and skip_exists:
            all_results.append(results['result_str'])
            csv_results.append(results['csv'])
            continue
        preds = results['pred_text']
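        # Optionally truncate each prediction at the first occurrence of 'R:', 'Q:',
        # 'Summary:', a newline, or a colon.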
        clean_preds = []
        if DO_PRED_CLEAN:
            for pred in preds:
                search_result = re.search(r'R:|Q:|Summary:|\n|:', pred)
                if search_result is not None:
                    clean_preds.append(pred[:search_result.span()[0]])
                else:
                    clean_preds.append(pred)
            preds = clean_preds
        tgt = results['gt_text']
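
        # Corpus-level BLEU via sacrebleu.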
        def bleu_score(prediction, ground_truths):
            from sacrebleu import BLEU
            bleu = BLEU()
            score = bleu.corpus_score(prediction, ground_truths)
            return score

        bleu = bleu_score(preds, [tgt])
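
        # BERTScore precision/recall/F1, averaged over all predictions.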
        precision, recall, f1 = bertscore.score(preds, tgt, verbose=False, batch_size=64)
        mean_precision = precision.mean().item()
        mean_recall = recall.mean().item()
        mean_f1 = f1.mean().item()
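
        # Distinct-1/2: fraction of unique unigrams and bigrams over the whole
        # corpus, a simple measure of lexical diversity.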
        def eval_distinct(corpus):
            unigrams = []
            bigrams = []
            for rep in corpus:
                rep = rep.strip()
                temp = rep.split(' ')
                unigrams += temp
                for i in range(len(temp) - 1):
                    bigrams.append(temp[i] + ' ' + temp[i + 1])
            distinct_1 = len(set(unigrams)) * 1.0 / len(unigrams)
            distinct_2 = len(set(bigrams)) * 1.0 / len(bigrams)
            return distinct_1, distinct_2
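
        # ROUGE (HF evaluate), optional METEOR, and the F1 metric from the local evaluation module.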
        rouge_results = rouge.compute(predictions=preds, references=tgt)
        rouge1, rouge2, rougel = rouge_results['rouge1'], rouge_results['rouge2'], rouge_results['rougeL']
        me_score = 0
        if ADD_METEOR:
            me_score = meteor_scorer.compute(predictions=preds, references=tgt)['meteor']
        from evaluation import rouge_score
        _rouge = rouge_score(preds, [tgt])  # ROUGE from the local evaluation module (not reported below)
        f1 = [f1_score(p, [t]) for p, t in zip(preds, tgt)]
        f1 = numpy.asarray(f1, dtype=float).mean()
        ppl = ''  # perplexity is not computed here; the CSV column stays blank
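
        # Human-readable summary plus one CSV row for this checkpoint.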
        result_str = f"""
path: {path}
F1: {f1}
bleu: {bleu.score}
bleu detail: {bleu.precisions}
rouge1, rouge2, rougel: {rouge1, rouge2, rougel}
BERT f1: {mean_f1}
BERT precision: {mean_precision}
BERT recall: {mean_recall}
dist: {eval_distinct(preds)}
METEOR: {me_score}
valid_num: {len(preds)}
"""
        csv_data = [path,
                    ppl,
                    f1 * 100.0,
                    bleu.score,
                    *bleu.precisions,
                    rouge1 * 100.0,
                    rouge2 * 100.0,
                    rougel * 100.0,
                    mean_f1 * 100.0,
                    mean_precision * 100.0,
                    mean_recall * 100.0,
                    *eval_distinct(preds),
                    me_score,
                    len(preds)]
        csv_results.append(csv_data)
        print(result_str)
        all_results.append(result_str)
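
        # Cache the computed scores back into the pickle so later runs with
        # skip_exists=True can reuse them.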
        with open(path, 'wb') as file:
            results['result_str'] = result_str
            results['csv'] = csv_data
            pickle.dump(results, file)
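
    # Write the aggregated text and CSV reports for the whole folder.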
    with open(results_path, 'w') as file:
        file.write("\n=====\n".join(all_results))
    with open(results_csv_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(csv_results)


if __name__ == '__main__':
    evaluate_folder(_main_path, skip_exists=False)