JohnnyBoy00 committed
Commit: 8841a3f
1 Parent(s): 0982cce

Upload evaluation.py

Files changed (1):
  1. evaluation.py +206 -0
evaluation.py ADDED
@@ -0,0 +1,206 @@
+ import numpy as np
+ import torch
+
+ from evaluate import load as load_metric
+
+ from sklearn.metrics import mean_squared_error
+ from tqdm.auto import tqdm
+
+ MAX_TARGET_LENGTH = 128
+
+ # load evaluation metrics
+ sacrebleu = load_metric('sacrebleu')
+ rouge = load_metric('rouge')
+ meteor = load_metric('meteor')
+ bertscore = load_metric('bertscore')
+
+ # use gpu if it's available
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+ def flatten_list(l):
+     """
+     Utility function to convert a list of lists into a flattened list
+     Params:
+         l (list of lists): list to be flattened
+     Returns:
+         A flattened list with the elements of the original list
+     """
+     return [item for sublist in l for item in sublist]
+
+ def parse_float(value):
+     """
+     Utility function to parse a string into a float
+     Params:
+         value (string): value to be converted to float
+     Returns:
+         The float representation of the given string, or None if the string could
+         not be converted to a float
+     """
+     try:
+         float_value = float(value)
+         return float_value
+     except ValueError:
+         return None
+
+ def extract_scores(predictions):
+     """
+     Utility function to extract the scores from the predictions of the model
+     Params:
+         predictions (list): complete model predictions
+     Returns:
+         scores (list): extracted scores from the model's predictions
+     """
+     scores = []
+     # iterate through predictions and try to extract the predicted score;
+     # if the score could not be extracted, set it to None
+     for pred in predictions:
+         try:
+             score_string = pred.split(' ', 1)[0].strip()
+             score = parse_float(score_string)
+         except IndexError:
+             score = None
+         scores.append(score)
+
+     return scores
+
+ def extract_feedback(predictions):
+     """
+     Utility function to extract the feedback from the predictions of the model
+     Params:
+         predictions (list): complete model predictions
+     Returns:
+         feedback (list): extracted feedback from the model's predictions
+     """
+     feedback = []
+     # iterate through predictions and try to extract the predicted feedback
+     for pred in predictions:
+         try:
+             fb = pred.split(':', 1)[1]
+         except IndexError:
+             try:
+                 fb = pred.split(' ', 1)[1]
+             except IndexError:
+                 fb = pred
+         feedback.append(fb.strip())
+
+     return feedback
+
+ def compute_rmse(predictions, labels):
+     """
+     Utility function to compute the root mean squared error of the
+     score predictions in relation to the golden label scores
+     Params:
+         predictions (list): model score predictions
+         labels (list): golden label scores
+     Returns:
+         (float, int): rmse of valid samples and number of invalid samples
+     """
+     # get indexes of valid score predictions
+     # (i.e., where the score is not None)
+     idx = np.where(np.array(predictions) != None)
+
+     # get size of the golden labels list and of
+     # the valid predictions array
+     labels_size = np.array(labels).size
+     valid_predictions_size = idx[0].size
+
+     # only compute rmse if valid score predictions were generated,
+     # otherwise set rmse to 1
+     if valid_predictions_size > 0:
+         # calculate rmse from labels and predictions
+         valid_predictions = np.array(predictions)[idx]
+         score_labels = np.array(labels)[idx]
+         rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
+
+         # cap rmse at 1
+         if rmse > 1:
+             return 1, labels_size - valid_predictions_size
+
+         # return computed rmse and number of invalid samples
+         return rmse, labels_size - valid_predictions_size
+     else:
+         return 1, labels_size - valid_predictions_size
+
+ def compute_metrics(predictions, labels):
+     """
+     Compute evaluation metrics from the predictions of the model
+     Params:
+         predictions (list): complete model predictions
+         labels (list): golden labels (previously tokenized)
+     Returns:
+         results (dict): dictionary with the computed evaluation metrics
+     """
+     # extract feedback and scores from the model's predictions
+     predicted_feedback = extract_feedback(predictions)
+     predicted_scores = extract_scores(predictions)
+
+     # extract feedback and scores from the golden labels
+     reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
+     reference_scores = [float(x.split('Feedback:', 1)[0].strip()) for x in labels]
+
+     # compute HF metrics
+     sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
+     rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
+     meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
+     bert_score = bertscore.compute(
+         predictions=predicted_feedback,
+         references=reference_feedback,
+         lang='de',
+         model_type='bert-base-multilingual-cased',
+         rescale_with_baseline=True)
+
+     # compute rmse of score predictions
+     rmse, _ = compute_rmse(predicted_scores, reference_scores)
+
+     results = {
+         'sacrebleu': sacrebleu_score,
+         'rouge': rouge_score,
+         'meteor': meteor_score,
+         'bert_score': np.array(bert_score['f1']).mean().item(),
+         'rmse': rmse
+     }
+
+     return results
+
+ def evaluate(model, tokenizer, dataloader):
+     """
+     Evaluate model on the given dataset
+     Params:
+         model (PreTrainedModel): seq2seq model
+         tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
+         dataloader (torch DataLoader): dataloader of the dataset to be used for evaluation
+     Returns:
+         results (dict): dictionary with the computed evaluation metrics
+         predictions (list): list of the decoded predictions of the model
+     """
+     decoded_preds, decoded_labels = [], []
+
+     model.eval()
+     # iterate through the batches in the dataloader
+     for batch in tqdm(dataloader):
+         with torch.no_grad():
+             batch = {k: v.to(device) for k, v in batch.items()}
+             # generate tokens from batch
+             generated_tokens = model.generate(
+                 batch['input_ids'],
+                 attention_mask=batch['attention_mask'],
+                 max_length=MAX_TARGET_LENGTH
+             )
+             # get golden labels from batch
+             labels_batch = batch['labels']
+
+             # decode model predictions and golden labels
+             decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+             decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
+
+             decoded_preds.append(decoded_preds_batch)
+             decoded_labels.append(decoded_labels_batch)
+
+     # convert predictions and golden labels into flattened lists
+     predictions = flatten_list(decoded_preds)
+     labels = flatten_list(decoded_labels)
+
+     # compute metrics based on predictions and golden labels
+     results = compute_metrics(predictions, labels)
+
+     return results, predictions
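
For reference, the sketch below shows one way the evaluate function from this file might be wired up end to end. It is a minimal, hypothetical example rather than part of the commit: the google/mt5-small checkpoint, the two German toy samples (German is assumed only because the script computes BERTScore with lang='de'), the tokenization lengths, and the choice of DataCollatorForSeq2Seq with label_pad_token_id set to the pad token (so the labels can be decoded directly, as evaluate expects) are all assumptions. Only the '<score> Feedback: <text>' target format is taken from compute_metrics, which splits the references on 'Feedback:'.

# Minimal usage sketch (not part of the commit): checkpoint, toy samples and
# collator settings are illustrative assumptions.
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

from evaluation import evaluate, MAX_TARGET_LENGTH

checkpoint = 'google/mt5-small'  # hypothetical checkpoint; any seq2seq model works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.to(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

# two toy samples; the targets follow the '<score> Feedback: <text>' pattern
# that compute_metrics expects when it splits the references on 'Feedback:'
inputs = ['Antwort: Die Hauptstadt von Deutschland ist Berlin.',
          'Antwort: Wasser kocht bei 50 Grad Celsius.']
targets = ['1.0 Feedback: Die Antwort ist korrekt.',
           '0.0 Feedback: Wasser kocht erst bei 100 Grad Celsius.']

def encode(source, target):
    # tokenize one input/target pair; the max lengths are arbitrary choices
    features = tokenizer(source, max_length=256, truncation=True)
    features['labels'] = tokenizer(text_target=target, max_length=MAX_TARGET_LENGTH, truncation=True)['input_ids']
    return features

# pad labels with the pad token (instead of -100) so that evaluate() can decode
# them directly with tokenizer.batch_decode
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)
dataloader = DataLoader([encode(s, t) for s, t in zip(inputs, targets)], batch_size=2, collate_fn=collator)

results, predictions = evaluate(model, tokenizer, dataloader)
print(results)  # dict with sacrebleu, rouge, meteor, bert_score and rmse

With an untuned checkpoint the generated scores will usually not parse, in which case compute_rmse falls back to its capped value of 1, while the text metrics are still computed on whatever feedback strings the model produces.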