jgauthier committed on
Commit
5cd2907
1 Parent(s): 092c6b1

update metric to support new dataset repr, and evaluating multiple suites in one call

Files changed (3)
  1. README.md +24 -7
  2. syntaxgym.py +19 -7
  3. test.py +10 -8
README.md CHANGED
@@ -27,23 +27,40 @@ The metric takes a SyntaxGym test suite as input, as well as the name of the mod
  ```python
  import datasets
  import evaluate
+ import numpy as np

  dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
  metric = evaluate.load("cpllab/syntaxgym")
- result = metric.compute(suite=dataset["test"], model_id="gpt2")
+ result = metric.compute(dataset=dataset["test"], model_id="gpt2")

  # Compute suite accuracy. Mean success over items, where "success" is the conjunction
  # of all boolean prediction results.
- suite_accuracy = result["prediction_results"].all(axis=1).mean(axis=0)
+ suite_accuracy = np.array(result["subordination_src-src"]["prediction_results"]).all(axis=1).mean(axis=0)
  ```

  ### Run the entire SyntaxGym dataset

- TODO
+ You can load and evaluate all suites at once by omitting the dataset configuration name (second argument):
+
+ ```python
+ import datasets
+ import evaluate
+ import numpy as np
+
+ dataset = datasets.load_dataset("cpllab/syntaxgym")
+ metric = evaluate.load("cpllab/syntaxgym")
+ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
+
+ # Compute suite accuracy. Mean success over items, where "success" is the conjunction
+ # of all boolean prediction results.
+ suite_accuracies = [np.array(suite_results["prediction_results"]).all(axis=1).mean(axis=0)
+                     for suite_results in result.values()]
+ overall_accuracy = np.mean(suite_accuracies)
+ ```

  ### Inputs

- - **suite** (`Dataset`): SyntaxGym test suite, represented as a Huggingface dataset. See the [dataset reference][syntaxgym-dataset].
+ - **dataset** (`Dataset`): SyntaxGym test suite, represented as a Huggingface dataset. See the [dataset reference][syntaxgym-dataset].
  - **model_id** (str): Model used to calculate probabilities of each word. (This is only well defined for causal language models. This includes models such as `gpt2`, causal variations of BERT, causal versions of T5, and more. The full list can be found in the [`AutoModelForCausalLM` documentation][causal].)
  - **batch_size** (int): Maximum batch size for computations
  - **add_start_token** (bool): Whether to add the start token to each sentence. Defaults to `True`.
@@ -51,15 +68,15 @@ TODO

  ### Output Values

- The metric returns a dict with two entries:
+ The metric returns a dict of dicts, mapping test suite names to test suite performance. Each inner dict has two entries:

  - **prediction_results** (`List[List[bool]]`): For each item in the test suite, a list of booleans indicating whether each corresponding prediction came out `True`. Typically these are combined to yield an accuracy score (see example usage above).
  - **region_totals** (`List[Dict[Tuple[str, int], float]]`): For each item, a mapping from individual region (keys `(<condition_name>, <region_number>)`) to the float-valued total surprisal for tokens in this region. This is useful for visualization, or if you'd like to use the aggregate surprisal data for other tasks (e.g. reading time prediction or neural activity prediction).

  ```python
- >>> print(result["prediction_results"][0])
+ >>> print(result["subordination_src-src"]["prediction_results"][0])
  [True]
- >>> print(result["region_totals"][0])
+ >>> print(result["subordination_src-src"]["region_totals"][0])
  {('sub_no-matrix', 1): 14.905603408813477,
   ('sub_no-matrix', 2): 39.063140869140625,
   ('sub_no-matrix', 3): 26.862628936767578,
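The updated docs describe `region_totals` as useful for visualization and for downstream analyses such as reading-time prediction. A minimal sketch of flattening the new per-suite `region_totals` output into a long-format table follows; it assumes the `result` dict produced by the examples above and that pandas is installed (pandas is not a dependency of the metric itself):

```python
import pandas as pd

# Assumes `result` is the per-suite dict returned by metric.compute(...), i.e.
# {suite_name: {"prediction_results": [...], "region_totals": [...]}, ...}.
rows = []
for suite_name, suite_results in result.items():
    for item_idx, item_region_totals in enumerate(suite_results["region_totals"]):
        for (condition_name, region_number), surprisal in item_region_totals.items():
            rows.append({
                "suite": suite_name,
                "item": item_idx,
                "condition": condition_name,
                "region": region_number,
                "surprisal": surprisal,
            })

surprisals = pd.DataFrame(rows)
# Mean surprisal per region within each condition of each suite.
print(surprisals.groupby(["suite", "condition", "region"])["surprisal"].mean())
```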
syntaxgym.py CHANGED
@@ -78,17 +78,25 @@ SUITE_DATASET_CONDITION_SPEC = {


  SUITE_DATASET_SPEC = {
+     "suite_name": datasets.Value("string"),
      "item_number": datasets.Value("int32"),
      "conditions": datasets.Sequence(SUITE_DATASET_CONDITION_SPEC),
      "predictions": datasets.Sequence(datasets.Value("string")),
  }


- class SyntaxGymMetricResult(TypedDict):
+ class SyntaxGymMetricSuiteResult(TypedDict):
+     """
+     Evaluation results for a single suite.
+     """
+     suite_name: str
      prediction_results: List[List[bool]]
      region_totals: List[Dict[Tuple[str, int], float]]


+ SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
+
+
  def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
      """
      Load and prepare a tokenizer for SyntaxGym evaluation.
@@ -137,7 +145,7 @@ class SyntaxGym(evaluate.EvaluationModule):
      def _info(self):
          seq = datasets.Sequence
          features = datasets.Features({
-             "suite": SUITE_DATASET_SPEC
+             "dataset": SUITE_DATASET_SPEC
          })
          return evaluate.EvaluationModuleInfo(
              module_type="metric",
@@ -149,7 +157,7 @@ class SyntaxGym(evaluate.EvaluationModule):
              codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
          )

-     def _compute(self, suite, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
+     def _compute(self, dataset, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
          if device is not None:
              assert device in ["gpu", "cpu", "cuda"]
              if device == "gpu":
@@ -163,14 +171,18 @@ class SyntaxGym(evaluate.EvaluationModule):

          tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)

-         results = {"prediction_results": [], "region_totals": []}
+         results = {}
+         result_keys = ["prediction_results", "region_totals"]
          # TODO batch all items together
-         for item in datasets.logging.tqdm(suite):
+         for item in datasets.logging.tqdm(dataset):
              result_single = self._compute_single(item, tokenizer, tokenizer_kwargs,
                                                   model, device)

-             for k in ["prediction_results", "region_totals"]:
-                 results[k].append(result_single[k])
+             suite_name = item["suite_name"]
+             if suite_name not in results:
+                 results[suite_name] = {k: [] for k in result_keys}
+             for k in result_keys:
+                 results[suite_name][k].append(result_single[k])

          return results

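The substantive change to `_compute` is that per-item results are now grouped by each item's `suite_name` field rather than pooled into one flat dict. Below is a self-contained sketch of the same accumulation pattern on toy data; the item and result values are illustrative stand-ins, not taken from the repository:

```python
from typing import Dict, List

# Toy stand-ins for dataset items and for what _compute_single returns per item.
items = [
    {"suite_name": "subordination_src-src", "item_number": 1},
    {"suite_name": "subordination_src-src", "item_number": 2},
    {"suite_name": "center_embed", "item_number": 1},
]

def compute_single(item) -> dict:
    # Placeholder for SyntaxGym._compute_single: per-item prediction outcomes
    # and per-region surprisal totals.
    return {"prediction_results": [True], "region_totals": {("cond", 1): 0.0}}

result_keys = ["prediction_results", "region_totals"]
results: Dict[str, Dict[str, List]] = {}
for item in items:
    result_single = compute_single(item)
    suite_name = item["suite_name"]
    if suite_name not in results:
        results[suite_name] = {k: [] for k in result_keys}
    for k in result_keys:
        results[suite_name][k].append(result_single[k])

# results is keyed by suite name, matching the metric's new return format.
print({name: len(r["prediction_results"]) for name, r in results.items()})
```

A `collections.defaultdict` would express the same grouping more compactly; the explicit membership check in the diff keeps the return value a plain `dict`.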
test.py CHANGED
@@ -495,20 +495,22 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
      implementation, using the same underlying `gpt2` model.
      """

-     dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
-     result = syntaxgym_metric.compute(suite=dataset["test"], model_id="gpt2")
+     suite_name = "subordination_src-src"
+     dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+     result = syntaxgym_metric.compute(dataset=dataset["test"], model_id="gpt2")

+     region_totals = result[suite_name]["region_totals"]
      from pprint import pprint
-     pprint(result["region_totals"][0])
+     pprint(region_totals[0])
      pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])

-     keys = result["region_totals"][0].keys()
+     keys = region_totals[0].keys()
      assert set(keys) == set(GPT2_SUBORDINATION_SRC_REFERENCE[0].keys())

-     result_ndarray = np.concatenate([np.array([region_totals[key] for key in keys])
-                                      for region_totals in result["region_totals"]])
-     reference_ndarray = np.concatenate([np.array([region_totals[key] for key in keys])
-                                         for region_totals in GPT2_SUBORDINATION_SRC_REFERENCE])
+     result_ndarray = np.concatenate([np.array([region_totals_i[key] for key in keys])
+                                      for region_totals_i in region_totals])
+     reference_ndarray = np.concatenate([np.array([region_totals_i[key] for key in keys])
+                                         for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
      pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                    key=lambda x: -x[1]))
      np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
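The test receives a `syntaxgym_metric` fixture whose definition is outside this diff. A plausible minimal fixture, assuming the metric is loaded once per session via `evaluate.load` (the actual fixture in `test.py` may load a local module path instead):

```python
import evaluate
import pytest

@pytest.fixture(scope="session")
def syntaxgym_metric():
    # Hypothetical fixture: load the SyntaxGym metric module once and reuse it
    # across tests.
    return evaluate.load("cpllab/syntaxgym")
```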