JohnnyBoy00 committed
Commit ef7e5e6
1 Parent(s): ee73a88

Upload evaluation.py

Files changed (1)
  1. evaluation.py +211 -0
evaluation.py ADDED
@@ -0,0 +1,211 @@
+ import numpy as np
+ import torch
+
+ from evaluate import load as load_metric
+
+ from sklearn.metrics import mean_squared_error
+ from tqdm.auto import tqdm
+
+ MAX_TARGET_LENGTH = 128
+
+ # load evaluation metrics
+ sacrebleu = load_metric('sacrebleu')
+ rouge = load_metric('rouge')
+ meteor = load_metric('meteor')
+ bertscore = load_metric('bertscore')
+
+ # use gpu if it's available
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+ def flatten_list(l):
+     """
+     Utility function to convert a list of lists into a flattened list
+     Params:
+         l (list of lists): list to be flattened
+     Returns:
+         A flattened list with the elements of the original list
+     """
+     return [item for sublist in l for item in sublist]
+
+ def parse_float(value):
+     """
+     Utility function to parse a string into a float
+
+     Params:
+         value (string): value to be converted to float
+     Returns:
+         The float representation of the given string, or None if the string could
+         not be converted to a float
+     """
+     try:
+         float_value = float(value)
+         return float_value
+     except ValueError:
+         return None
+
+ def extract_scores(predictions):
+     """
+     Utility function to extract the scores from the predictions of the model
+
+     Params:
+         predictions (list): complete model predictions
+     Returns:
+         scores (list): extracted scores from the model's predictions
+     """
+     scores = []
+     # iterate through predictions and try to extract predicted score;
+     # if score could not be extracted, set it to None
+     for pred in predictions:
+         try:
+             score_string = pred.split(' ', 1)[0].strip()
+             score = parse_float(score_string)
+         except IndexError:
+             score = None
+         scores.append(score)
+
+     return scores
+
+ def extract_feedback(predictions):
+     """
+     Utility function to extract the feedback from the predictions of the model
+
+     Params:
+         predictions (list): complete model predictions
+     Returns:
+         feedback (list): extracted feedback from the model's predictions
+     """
+     feedback = []
+     # iterate through predictions and try to extract predicted feedback
+     for pred in predictions:
+         try:
+             fb = pred.split(':', 1)[1]
+         except IndexError:
+             try:
+                 fb = pred.split(' ', 1)[1]
+             except IndexError:
+                 fb = pred
+         feedback.append(fb.strip())
+
+     return feedback
+
+ def compute_rmse(predictions, labels):
+     """
+     Utility function to compute the root mean squared error of the
+     score predictions in relation to the golden label scores
+
+     Params:
+         predictions (list): model score predictions
+         labels (list): golden label scores
+     Returns:
+         (float, int): rmse of valid samples and number of invalid samples
+     """
+     # get indexes of valid score predictions
+     # (i.e., where the score is not None)
+     idx = np.where(np.array(predictions) != None)
+
+     # get size of the golden labels list and of
+     # the valid predictions array
+     labels_size = np.array(labels).size
+     valid_predictions_size = idx[0].size
+
+     # only compute rmse if valid score predictions were generated,
+     # otherwise set rmse to 1
+     if valid_predictions_size > 0:
+         # calculate rmse from labels and predictions
+         valid_predictions = np.array(predictions)[idx]
+         score_labels = np.array(labels)[idx]
+         rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
+
+         # cap rmse at 1
+         if rmse > 1:
+             return 1, labels_size - valid_predictions_size
+
+         # return computed rmse and number of invalid samples
+         return rmse, labels_size - valid_predictions_size
+     else:
+         return 1, labels_size - valid_predictions_size
+
+ def compute_metrics(predictions, labels):
+     """
+     Compute evaluation metrics from the predictions of the model
+
+     Params:
+         predictions (list): complete model predictions
+         labels (list): golden labels (previously tokenized)
+     Returns:
+         results (dict): dictionary with the computed evaluation metrics
+     """
+     # extract feedback and scores from the model's predictions
+     predicted_feedback = extract_feedback(predictions)
+     predicted_scores = extract_scores(predictions)
+
+     # extract feedback and scores from the golden labels
+     reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
+     reference_scores = [float(x.split('Feedback:', 1)[0].strip()) for x in labels]
+
+     # compute HF metrics
+     sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
+     rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
+     meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
+     bert_score = bertscore.compute(
+         predictions=predicted_feedback,
+         references=reference_feedback,
+         lang='de',
+         model_type='bert-base-multilingual-cased',
+         rescale_with_baseline=True)
+
+     # compute rmse of score predictions
+     rmse, _ = compute_rmse(predicted_scores, reference_scores)
+
+     results = {
+         'sacrebleu': sacrebleu_score,
+         'rouge': rouge_score,
+         'meteor': meteor_score,
+         'bert_score': np.array(bert_score['f1']).mean().item(),
+         'rmse': rmse
+     }
+
+     return results
+
+ def evaluate(model, tokenizer, dataloader):
+     """
+     Evaluate model on the given dataset
+     Params:
+         model (PreTrainedModel): seq2seq model
+         tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
+         dataloader (torch Dataloader): dataloader of the dataset to be used for evaluation
+     Returns:
+         results (dict): dictionary with the computed evaluation metrics
+         predictions (list): list of the decoded predictions of the model
+     """
+     decoded_preds, decoded_labels = [], []
+
+     model.eval()
+     # iterate through batches in the dataloader
+     for batch in tqdm(dataloader):
+         with torch.no_grad():
+             batch = {k: v.to(device) for k, v in batch.items()}
+             # generate tokens from batch
+             generated_tokens = model.generate(
+                 batch['input_ids'],
+                 attention_mask=batch['attention_mask'],
+                 max_length=MAX_TARGET_LENGTH
+             )
+             # get golden labels from batch
+             labels_batch = batch['labels']
+
+             # decode model predictions and golden labels
+             decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+             decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
+
+             decoded_preds.append(decoded_preds_batch)
+             decoded_labels.append(decoded_labels_batch)
+
+     # convert predictions and golden labels into flattened lists
+     predictions = flatten_list(decoded_preds)
+     labels = flatten_list(decoded_labels)
+
+     # compute metrics based on predictions and golden labels
+     results = compute_metrics(predictions, labels)
+
+     return results, predictions
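
Usage sketch (not part of the committed file): a minimal, hypothetical example of driving compute_metrics directly on already-decoded strings. It assumes the file is importable as evaluation, and the example strings are invented purely to illustrate the "<score> Feedback: <text>" convention that extract_scores, extract_feedback, and the label parsing in compute_metrics rely on.

# hypothetical sketch: compute the metrics from already-decoded strings;
# the texts below are made up to match the expected "<score> Feedback: <text>" format
from evaluation import compute_metrics

predictions = [
    '0.5 Feedback: Die Antwort ist nur teilweise korrekt.',
    '1.0 Feedback: Die Antwort ist vollständig und korrekt.'
]
labels = [
    '1.0 Feedback: Die Antwort ist vollständig und korrekt.',
    '1.0 Feedback: Die Antwort ist vollständig und korrekt.'
]

print(compute_metrics(predictions, labels))
# dict with 'sacrebleu', 'rouge', 'meteor', 'bert_score' and 'rmse'

Evaluating an actual model goes through evaluate(model, tokenizer, dataloader) instead. The end-to-end sketch below uses 't5-small', a made-up prompt prefix, and a one-sample toy dataset as stand-ins for the fine-tuned grading model and evaluation set this script is presumably paired with.

# hypothetical end-to-end sketch; checkpoint, prompt prefix, and samples are placeholders
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

from evaluation import evaluate

checkpoint = 't5-small'  # placeholder for the fine-tuned feedback/scoring checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.to(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

# toy sample; the target follows the "<score> Feedback: <text>" convention
inputs = ['grade: Die Mitochondrien produzieren Energie für die Zelle.']
targets = ['1.0 Feedback: Die Antwort ist korrekt.']

features = [
    {**tokenizer(inp, truncation=True), 'labels': tokenizer(tgt, truncation=True)['input_ids']}
    for inp, tgt in zip(inputs, targets)
]
collator = DataCollatorForSeq2Seq(tokenizer, model=model)
dataloader = DataLoader(features, batch_size=1, collate_fn=collator)

results, model_predictions = evaluate(model, tokenizer, dataloader)

Note that evaluate decodes batch['labels'] directly, so the dataloader must not replace label padding with -100; with batch_size=1, as above, no label padding is applied.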