Jon Gauthier committed
Commit 8cca3d0
1 Parent(s): 8fe0b5d

refactor metric to support evaluating `all-2020` split

Files changed (2):
  1. syntaxgym.py +37 -19
  2. test/test_syntaxgym.py +18 -1
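
For context, a minimal usage sketch of what this change enables. This is a hypothetical invocation, not taken from the commit: the `evaluate.load` id, the choice of model id, and the use of the default dataset config are assumptions; the new test below drives the same `compute` call through a fixture.

```python
import datasets
import evaluate

# Assumed metric id; the test below receives the metric via a pytest fixture instead.
syntaxgym_metric = evaluate.load("cpllab/syntaxgym")

# Composite benchmark containing every suite (the default config),
# rather than a single-suite config such as "number_prep".
full_dataset = datasets.load_dataset("cpllab/syntaxgym")

# After this commit, compute() returns one SyntaxGymMetricSuiteResult per suite,
# keyed by suite name, even when the dataset mixes suites.
results = syntaxgym_metric.compute(
    dataset=full_dataset["test"],
    model_id="gpt2",  # illustrative; the new test uses hf-internal-testing/tiny-xlm-roberta
)
print(results["number_prep"].prediction_results)
```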
syntaxgym.py CHANGED
@@ -187,14 +187,25 @@ class SyntaxGym(evaluate.EvaluationModule):
 
         tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
-        # Flatten sentences, enforcing that sentences are always ordered by the same condition.
-        condition_order = dataset[0]["conditions"]["condition_name"]
+        # Flatten sentences, enforcing that sentences are always ordered by the same condition
+        # within-suite.
+        condition_orders = {}
+        for item in dataset:
+            condition_orders[item["suite_name"]] = item["conditions"]["condition_name"]
+        # Flattened batch of sentences
         all_sentences = []
+        # Mapping from sentence back to originating suite
+        all_sentence_suites = []
+        # Mapping from item back to originating suite
+        all_item_suites = []
         for item in dataset:
-            for condition_name in condition_order:
+            for condition_name in condition_orders[item["suite_name"]]:
                 # Get idx of condition for this item.
                 condition_idx = item["conditions"]["condition_name"].index(condition_name)
+
                 all_sentences.append(item["conditions"]["content"][condition_idx])
+                all_sentence_suites.append(item["suite_name"])
+                all_item_suites.append(item["suite_name"])
 
         # Tokenize sentences and split into batches.
         all_tokenized_sentences = tokenizer(all_sentences, return_tensors="pt",
@@ -205,7 +216,7 @@ class SyntaxGym(evaluate.EvaluationModule):
         # Compute surprisal per-batch and combine into a single surprisal tensor.
         n_sentences, n_timesteps = all_tokenized_sentences["input_ids"].shape
         surprisals = torch.zeros(n_sentences, n_timesteps - 1).float().to(device)
-        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches)):
+        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches, desc="Computing surprisals", unit="batch")):
             batch = batch.to(device)
             with torch.no_grad():
                 # logits are B * T * V
@@ -219,22 +230,29 @@ class SyntaxGym(evaluate.EvaluationModule):
 
             surprisals[i * batch_size : (i + 1) * batch_size] = b_surprisals_gt
 
-        # Reshape to intuitive axes n_items * n_conditions * ...
-        surprisals = surprisals.reshape((len(dataset), len(condition_order), -1))
-        offset_mapping = all_tokenized_sentences["offset_mapping"] \
-            .reshape((len(dataset), len(condition_order), -1, 2))
-
-        # Now evaluate per-item.
+        # Aggregate results within-suite
         results = {}
-        result_keys = ["prediction_results", "region_totals"]
-        for item, item_surprisals, item_offset_mapping in zip(datasets.logging.tqdm(dataset), surprisals, offset_mapping):
-            result_i = self._compute_item(item, item_surprisals, item_offset_mapping, condition_order)
-
-            suite_name = item["suite_name"]
-            if suite_name not in results:
-                results[suite_name] = SyntaxGymMetricSuiteResult(suite_name, [], [])
-            for k in result_keys:
-                getattr(results[suite_name], k).append(result_i[k])
+        all_sentence_suites = np.array(all_sentence_suites)
+        all_item_suites = np.array(all_item_suites)
+        for suite, condition_order in datasets.logging.tqdm(condition_orders.items(), unit="suite"):
+            suite_sentence_idxs = np.where(all_sentence_suites == suite)[0]
+            suite_item_idxs = np.where(all_item_suites == suite)[0]
+            suite_surprisals = surprisals[suite_sentence_idxs]
+
+            # Reshape to intuitive axes n_items * n_conditions * ...
+            suite_surprisals = suite_surprisals.reshape((len(suite_item_idxs), len(condition_order), -1))
+            suite_offset_mapping = all_tokenized_sentences["offset_mapping"][suite_sentence_idxs] \
+                .reshape((len(suite_item_idxs), len(condition_order), -1, 2))
+
+            # Evaluate per-item
+            suite_result = SyntaxGymMetricSuiteResult(suite, [], [])
+            suite_items = datasets.logging.tqdm([dataset[idx] for idx in suite_item_idxs], unit="item")
+            for item, item_surprisals, item_offset_mapping in zip(suite_items, suite_surprisals, suite_offset_mapping):
+                result_i = self._compute_item(item, item_surprisals, item_offset_mapping, condition_order)
+                suite_result.prediction_results.append(result_i["prediction_results"])
+                suite_result.region_totals.append(result_i["region_totals"])
+
+            results[suite] = suite_result
 
         return results
 
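
The heart of the refactor is the per-suite grouping: sentences from all suites are scored in one flat batch, then pulled back apart with `np.where` and reshaped to `(n_items, n_conditions, ...)` per suite. Below is a standalone toy sketch of that indexing pattern, using made-up suite names and a fake surprisal matrix rather than the metric's real data.

```python
import numpy as np

# Fake flattened batch: 2 suites, 2 items each, 2 conditions per item,
# 3 "timesteps" of surprisal per sentence (values are arbitrary).
sentence_suites = np.array(["suite_a", "suite_a", "suite_a", "suite_a",
                            "suite_b", "suite_b", "suite_b", "suite_b"])
surprisals = np.arange(8 * 3, dtype=float).reshape(8, 3)

condition_order = ["match", "mismatch"]  # 2 conditions per item

for suite in ["suite_a", "suite_b"]:
    # Select only the sentences that came from this suite ...
    idxs = np.where(sentence_suites == suite)[0]
    n_items = len(idxs) // len(condition_order)
    # ... and regroup them into (n_items, n_conditions, n_timesteps),
    # mirroring the per-suite reshape in the hunk above.
    grouped = surprisals[idxs].reshape((n_items, len(condition_order), -1))
    print(suite, grouped.shape)  # -> (2, 2, 3)
```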
 
test/test_syntaxgym.py CHANGED
@@ -513,4 +513,21 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
                                  for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
     pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                   key=lambda x: -x[1]))
-    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+
+
+def test_evaluation_all_vs_single(syntaxgym_metric):
+    """
+    Check that a suite's performance is the same when evaluated in the composite
+    benchmark vs. evaluated independently.
+    """
+
+    suite_name = "number_prep"
+    full_dataset = datasets.load_dataset("cpllab/syntaxgym")
+    sub_dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+    model_id = "hf-internal-testing/tiny-xlm-roberta"
+
+    full_result = syntaxgym_metric.compute(dataset=full_dataset["test"], model_id=model_id)
+    sub_result = syntaxgym_metric.compute(dataset=sub_dataset["test"], model_id=model_id)
+
+    assert full_result[suite_name].prediction_results == sub_result[suite_name].prediction_results
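
The new test exercises the refactor end-to-end: `number_prep` evaluated as part of the composite benchmark must yield the same `prediction_results` as when evaluated on its own. Locally it can be selected with, e.g., `pytest test/test_syntaxgym.py -k test_evaluation_all_vs_single` (assuming pytest is the runner this repo uses).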