JohnnyBoy00's picture
Upload evaluation.py
028cc28
raw
history blame contribute delete
No virus
6.08 kB
import numpy as np
import torch
from evaluate import load as load_metric
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
MAX_TARGET_LENGTH = 128
# load evaluation metrics
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')
# use gpu if it's available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def flatten_list(l):
"""
Utility function to convert a list of lists into a flattened list
Params:
l (list of lists): list to be flattened
Returns:
A flattened list with the elements of the original list
"""
return [item for sublist in l for item in sublist]
def extract_feedback(predictions):
"""
Utility function to extract the feedback from the predictions of the model
Params:
predictions (list): complete model predictions
Returns:
feedback (list): extracted feedback from the model's predictions
"""
feedback = []
# iterate through predictions and try to extract predicted feedback
for pred in predictions:
try:
fb = pred.split(':', 1)[1]
except IndexError:
try:
if pred.lower().startswith('partially correct'):
fb = pred.split(' ', 1)[2]
else:
fb = pred.split(' ', 1)[1]
except IndexError:
fb = pred
feedback.append(fb.strip())
return feedback
def extract_labels(predictions):
"""
Utility function to extract the labels from the predictions of the model
Params:
predictions (list): complete model predictions
Returns:
feedback (list): extracted labels from the model's predictions
"""
labels = []
for pred in predictions:
if pred.lower().startswith('correct'):
label = 'Correct'
elif pred.lower().startswith('partially correct'):
label = 'Partially correct'
elif pred.lower().startswith('incorrect'):
label = 'Incorrect'
else:
label = 'Unknown label'
labels.append(label)
return labels
def compute_metrics(predictions, labels):
"""
Compute evaluation metrics from the predictions of the model
Params:
predictions (list): complete model predictions
labels (list): golden labels (previously tokenized)
Returns:
results (dict): dictionary with the computed evaluation metrics
predictions (list): list of the decoded predictions of the model
"""
# extract feedback and labels from the model's predictions
predicted_feedback = extract_feedback(predictions)
predicted_labels = extract_labels(predictions)
# extract feedback and labels from the golden labels
reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
reference_labels = [x.split('Feedback:', 1)[0].strip() for x in labels]
# compute HF metrics
sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
bert_score = bertscore.compute(
predictions=predicted_feedback,
references=reference_feedback,
lang='de',
model_type='bert-base-multilingual-cased',
rescale_with_baseline=True)
# use sklearn to compute accuracy and f1 score
reference_labels_np = np.array(reference_labels)
accuracy = accuracy_score(reference_labels_np, predicted_labels)
f1_weighted = f1_score(reference_labels_np, predicted_labels, average='weighted')
f1_macro = f1_score(
reference_labels_np,
predicted_labels,
average='macro',
labels=['Incorrect', 'Partially correct', 'Correct'])
results = {
'sacrebleu': sacrebleu_score,
'rouge': rouge_score,
'meteor': meteor_score,
'bert_score': np.array(bert_score['f1']).mean().item(),
'accuracy': accuracy,
'f1_weighted': f1_weighted,
'f1_macro': f1_macro
}
return results
def evaluate(model, tokenizer, dataloader):
"""
Evaluate model on the given dataset
Params:
model (PreTrainedModel): seq2seq model
tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
dataloader (torch Dataloader): dataloader of the dataset to be used for evaluation
Returns:
results (dict): dictionary with the computed evaluation metrics
predictions (list): list of the decoded predictions of the model
"""
decoded_preds, decoded_labels = [], []
model.eval()
# iterate through batchs in the dataloader
for batch in tqdm(dataloader):
with torch.no_grad():
batch = {k: v.to(device) for k, v in batch.items()}
# generate tokens from batch
generated_tokens = model.generate(
batch['input_ids'],
attention_mask=batch['attention_mask'],
max_length=MAX_TARGET_LENGTH
)
# get golden labels from batch
labels_batch = batch['labels']
# decode model predictions and golden labels
decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
decoded_preds.append(decoded_preds_batch)
decoded_labels.append(decoded_labels_batch)
# convert predictions and golden labels into flattened lists
predictions = flatten_list(decoded_preds)
labels = flatten_list(decoded_labels)
# compute metrics based on predictions and golden labels
results = compute_metrics(predictions, labels)
return results, predictions