yu-val-weiss committed
Commit b8756a1 · 1 Parent: 0a5e4ab

remove numpy, switch to torch (avoid to/from cpu as much)

Files changed:
- blimp.py (+17 -17)
- requirements.txt (+0 -1)
blimp.py
CHANGED
@@ -18,7 +18,6 @@ from typing import Optional
 
 import datasets
 import evaluate
-import numpy as np
 import torch
 from evaluate import logging
 from transformers import AutoModelForCausalLM, AutoTokenizer
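Context for the import change: with reductions done in torch, per-batch scores never have to round-trip through host memory the way a NumPy call forces them to. A minimal illustration of the pattern being replaced (not code from this repo):

import torch

scores = torch.tensor([-12.3, -8.1, -20.4])  # could just as well live on cuda/mps
mean_score = scores.mean().item()            # on-device reduction, one scalar copied back
# the NumPy equivalent would force a full device-to-host copy first:
# mean_score = np.mean(scores.cpu().numpy())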
@@ -175,7 +174,8 @@ class Blimp(evaluate.Metric):
             else ("mps" if torch.mps.is_available() else "cpu")
         )
 
-        if samples_per_set is None:
+        samples_per_set = 1000 if samples_per_set is None else samples_per_set
+        if samples_per_set <= 0 or samples_per_set > 1000:
             samples_per_set = 1000
 
         model = AutoModelForCausalLM.from_pretrained(
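Each BLiMP paradigm ships exactly 1,000 minimal pairs, so the new guard clamps any out-of-range request back to the full set. A toy check of the logic above (hypothetical helper name, illustrative values):

def clamp_samples(samples_per_set):
    # same rule as the diff: default None, reject non-positive or oversized values
    samples_per_set = 1000 if samples_per_set is None else samples_per_set
    if samples_per_set <= 0 or samples_per_set > 1000:
        samples_per_set = 1000
    return samples_per_set

assert clamp_samples(None) == 1000
assert clamp_samples(-5) == 1000
assert clamp_samples(250) == 250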
@@ -226,13 +226,11 @@ class Blimp(evaluate.Metric):
 
             # Prepare batches of good and bad sentences
 
-            phenom = dataset[0]["linguistics_term"]
-
             sents = [(x["sentence_good"], x["sentence_bad"]) for x in dataset]
-            good_sents, bad_sents = zip(*sents[:
+            good_sents, bad_sents = zip(*sents[:samples_per_set])
 
             # Get probabilities in batches
-            good_probs = get_batch_probabilities(
+            good_probs = _get_batch_probabilities(
                 model,
                 tokenizer,
                 good_sents,
@@ -241,7 +239,7 @@ class Blimp(evaluate.Metric):
                 category,
                 sent_type="good",
             )
-            bad_probs = get_batch_probabilities(
+            bad_probs = _get_batch_probabilities(
                 model,
                 tokenizer,
                 bad_sents,
@@ -251,22 +249,24 @@ class Blimp(evaluate.Metric):
|
|
251 |
sent_type="bad",
|
252 |
)
|
253 |
|
254 |
-
# compute accuracy (mean of instances where good prob > bad prob)
|
255 |
-
|
|
|
|
|
256 |
|
257 |
-
results[category] =
|
258 |
-
phenom_results[phenom].append(
|
259 |
|
260 |
return {
|
261 |
"by_uid": results,
|
262 |
-
"accuracy":
|
263 |
"by_phenomenon": {
|
264 |
-
term:
|
265 |
},
|
266 |
}
|
267 |
|
268 |
|
269 |
-
def
|
270 |
model,
|
271 |
tokenizer,
|
272 |
sentences: list[str],
|
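The comparison good_probs > bad_probs yields a bool tensor, so its float mean is exactly the fraction of minimal pairs where the model prefers the grammatical sentence. A small sketch with made-up log probabilities (the phenomenon name is illustrative):

import torch
from collections import defaultdict

good_probs = torch.tensor([-12.3, -8.1, -20.4])  # hypothetical sentence log probs
bad_probs = torch.tensor([-14.0, -7.9, -22.1])

sub_acc = (good_probs > bad_probs).float().mean().item()
print(sub_acc)  # 0.666...: the model ranks 2 of 3 pairs correctly

# per-phenomenon aggregation mirrors phenom_results in the diff
phenom_results = defaultdict(list)
phenom_results["anaphor_agreement"].append(sub_acc)
by_phenomenon = {term: sum(acc) / len(acc) for term, acc in phenom_results.items()}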
@@ -276,7 +276,7 @@ def get_batch_probabilities(
     sent_type: str = "good",
 ):
     """Compute log probabilities for a batch of sentences"""
-    probs =
+    probs = torch.zeros(len(sentences))
 
     for i in logging.tqdm(
         range(0, len(sentences), batch_size),
@@ -307,6 +307,6 @@ def get_batch_probabilities(
         # sum log probabilities
         sequence_log_probs = token_log_probs.sum(dim=1)
 
-        probs
+        probs[i : i + batch_size] = sequence_log_probs
 
-    return
+    return probs
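Taken together, the helper now writes each batch's scores into a preallocated tensor slice by slice instead of appending to a Python list and converting through NumPy at the end, so only one small copy leaves the device per batch. A self-contained sketch of that scoring loop, assuming a causal LM and a tokenizer with a pad token set (function name and padding handling are assumptions, not this repo's exact code):

import torch
import torch.nn.functional as F

def score_sentences(model, tokenizer, sentences, batch_size=16, device="cpu"):
    probs = torch.zeros(len(sentences))  # preallocate once, fill slice by slice
    for i in range(0, len(sentences), batch_size):
        batch = list(sentences[i : i + batch_size])
        enc = tokenizer(batch, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            logits = model(**enc).logits  # (batch, seq_len, vocab)
        # log-prob of each realized token: predict position t+1 from position t
        log_probs = F.log_softmax(logits[:, :-1], dim=-1)
        labels = enc["input_ids"][:, 1:]
        token_log_probs = log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
        # zero out padding positions so they do not contribute to the sentence score
        token_log_probs = token_log_probs * enc["attention_mask"][:, 1:]
        probs[i : i + batch_size] = token_log_probs.sum(dim=1)
    return probs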
requirements.txt
CHANGED
@@ -1,4 +1,3 @@
 git+https://github.com/huggingface/evaluate@5aa3982a9a8c86e506860e381d428a64b0cce73b
 torch
 transformers
-numpy