yu-val-weiss committed on
Commit
b8756a1
·
1 Parent(s): 0a5e4ab

Remove numpy, switch to torch (avoid transfers to/from the CPU as much as possible)

Browse files
Files changed (2) hide show
  1. blimp.py +17 -17
  2. requirements.txt +0 -1
blimp.py CHANGED
@@ -18,7 +18,6 @@ from typing import Optional
18
 
19
  import datasets
20
  import evaluate
21
- import numpy as np
22
  import torch
23
  from evaluate import logging
24
  from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -175,7 +174,8 @@ class Blimp(evaluate.Metric):
175
  else ("mps" if torch.mps.is_available() else "cpu")
176
  )
177
 
178
- if samples_per_set is None or samples_per_set <= 0:
 
179
  samples_per_set = 1000
180
 
181
  model = AutoModelForCausalLM.from_pretrained(
@@ -226,13 +226,11 @@ class Blimp(evaluate.Metric):
226
 
227
  # Prepare batches of good and bad sentences
228
 
229
- phenom = dataset[0]["linguistics_term"]
230
-
231
  sents = [(x["sentence_good"], x["sentence_bad"]) for x in dataset]
232
- good_sents, bad_sents = zip(*sents[: min(1000, samples_per_set)])
233
 
234
  # Get probabilities in batches
235
- good_probs = get_batch_probabilities(
236
  model,
237
  tokenizer,
238
  good_sents,
@@ -241,7 +239,7 @@ class Blimp(evaluate.Metric):
241
  category,
242
  sent_type="good",
243
  )
244
- bad_probs = get_batch_probabilities(
245
  model,
246
  tokenizer,
247
  bad_sents,
@@ -251,22 +249,24 @@ class Blimp(evaluate.Metric):
251
  sent_type="bad",
252
  )
253
 
254
- # compute accuracy (mean of instances where good prob > bad prob)
255
- accuracy = np.mean(good_probs > bad_probs)
 
 
256
 
257
- results[category] = accuracy
258
- phenom_results[phenom].append(accuracy)
259
 
260
  return {
261
  "by_uid": results,
262
- "accuracy": np.mean(list(results.values())),
263
  "by_phenomenon": {
264
- term: np.mean(acc) for term, acc in phenom_results.items()
265
  },
266
  }
267
 
268
 
269
- def get_batch_probabilities(
270
  model,
271
  tokenizer,
272
  sentences: list[str],
@@ -276,7 +276,7 @@ def get_batch_probabilities(
276
  sent_type: str = "good",
277
  ):
278
  """Compute log probabilities for a batch of sentences"""
279
- probs = []
280
 
281
  for i in logging.tqdm(
282
  range(0, len(sentences), batch_size),
@@ -307,6 +307,6 @@ def get_batch_probabilities(
307
  # sum log probabilities
308
  sequence_log_probs = token_log_probs.sum(dim=1)
309
 
310
- probs.append(sequence_log_probs.cpu().numpy())
311
 
312
- return np.concatenate(probs)
 
18
 
19
  import datasets
20
  import evaluate
 
21
  import torch
22
  from evaluate import logging
23
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
174
  else ("mps" if torch.mps.is_available() else "cpu")
175
  )
176
 
177
+ samples_per_set = 1000 if samples_per_set is None else samples_per_set
178
+ if samples_per_set <= 0 or samples_per_set > 1000:
179
  samples_per_set = 1000
180
 
181
  model = AutoModelForCausalLM.from_pretrained(
 
226
 
227
  # Prepare batches of good and bad sentences
228
 
 
 
229
  sents = [(x["sentence_good"], x["sentence_bad"]) for x in dataset]
230
+ good_sents, bad_sents = zip(*sents[:samples_per_set])
231
 
232
  # Get probabilities in batches
233
+ good_probs = _get_batch_probabilities(
234
  model,
235
  tokenizer,
236
  good_sents,
 
239
  category,
240
  sent_type="good",
241
  )
242
+ bad_probs = _get_batch_probabilities(
243
  model,
244
  tokenizer,
245
  bad_sents,
 
249
  sent_type="bad",
250
  )
251
 
252
+ # compute accuracy (mean of instances where good prob > bad prob) for this UID
253
+ sub_acc = (good_probs > bad_probs).float().mean().item()
254
+
255
+ phenom = dataset[0]["linguistics_term"]
256
 
257
+ results[category] = sub_acc
258
+ phenom_results[phenom].append(sub_acc)
259
 
260
  return {
261
  "by_uid": results,
262
+ "accuracy": sum(results.values()) / len(results), # overall accuracy
263
  "by_phenomenon": {
264
+ term: sum(acc) / len(acc) for term, acc in phenom_results.items()
265
  },
266
  }
267
 
268
 
269
+ def _get_batch_probabilities(
270
  model,
271
  tokenizer,
272
  sentences: list[str],
 
276
  sent_type: str = "good",
277
  ):
278
  """Compute log probabilities for a batch of sentences"""
279
+ probs = torch.zeros(len(sentences))
280
 
281
  for i in logging.tqdm(
282
  range(0, len(sentences), batch_size),
 
307
  # sum log probabilities
308
  sequence_log_probs = token_log_probs.sum(dim=1)
309
 
310
+ probs[i : i + batch_size] = sequence_log_probs
311
 
312
+ return probs
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
  git+https://github.com/huggingface/evaluate@5aa3982a9a8c86e506860e381d428a64b0cce73b
2
  torch
3
  transformers
4
- numpy
 
1
  git+https://github.com/huggingface/evaluate@5aa3982a9a8c86e506860e381d428a64b0cce73b
2
  torch
3
  transformers