Update metric to support the new dataset representation, and to evaluate multiple suites in one call
Files changed:
- README.md (+24, -7)
- syntaxgym.py (+19, -7)
- test.py (+10, -8)
README.md CHANGED

````diff
@@ -27,23 +27,40 @@ The metric takes a SyntaxGym test suite as input, as well as the name of the mod
 ```python
 import datasets
 import evaluate
+import numpy as np
 
 dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
 metric = evaluate.load("cpllab/syntaxgym")
-result = metric.compute(
+result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracy = result["prediction_results"].all(axis=1).mean(axis=0)
+suite_accuracy = np.array(result["subordination_src-src"]["prediction_results"]).all(axis=1).mean(axis=0)
 ```
 
 ### Run the entire SyntaxGym dataset
 
-TODO
+You can load and evaluate all suites at once by omitting the dataset configuration name (second argument):
+
+```python
+import datasets
+import evaluate
+import numpy as np
+
+dataset = datasets.load_dataset("cpllab/syntaxgym")
+metric = evaluate.load("cpllab/syntaxgym")
+result = metric.compute(dataset=dataset["test"], model_id="gpt2")
+
+# Compute suite accuracy. Mean success over items, where "success" is the conjunction
+# of all boolean prediction results.
+suite_accuracies = [np.array(suite_results["prediction_results"]).all(axis=1).mean(axis=0)
+                    for suite_results in result.values()]
+overall_accuracy = np.mean(suite_accuracies)
+```
 
 ### Inputs
 
-- **
+- **dataset** (`Dataset`): SyntaxGym test suite, represented as a Huggingface dataset. See the [dataset reference][syntaxgym-dataset].
 - **model_id** (str): Model used to calculate probabilities of each word. (This is only well defined for causal language models. This includes models such as `gpt2`, causal variations of BERT, causal versions of T5, and more. The full list can be found in the [`AutoModelForCausalLM` documentation][causal].)
 - **batch_size** (int): Maximum batch size for computations
 - **add_start_token** (bool): whether to add the start token to each sentence. Defaults to `True`.
@@ -51,15 +68,15 @@ TODO
 
 ### Output Values
 
-The metric returns a dict
+The metric returns a dict of dicts, mapping test suite names to test suite performance. Each inner dict has two entries:
 
 - **prediction_results** (`List[List[bool]]`): For each item in the test suite, a list of booleans indicating whether each corresponding prediction came out `True`. Typically these are combined to yield an accuracy score (see example usage above).
 - **region_totals** (`List[Dict[Tuple[str, int], float]]`): For each item, a mapping from individual region (keys `(<condition_name>, <region_number>)`) to the float-valued total surprisal for tokens in this region. This is useful for visualization, or if you'd like to use the aggregate surprisal data for other tasks (e.g. reading time prediction or neural activity prediction).
 
 ```python
->>> print(result["prediction_results"][0])
+>>> print(result["subordination_src-src"]["prediction_results"][0])
 [True]
->>> print(result["region_totals"][0])
+>>> print(result["subordination_src-src"]["region_totals"][0])
 {('sub_no-matrix', 1): 14.905603408813477,
  ('sub_no-matrix', 2): 39.063140869140625,
  ('sub_no-matrix', 3): 26.862628936767578,
````
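For downstream analysis, the new dict-of-dicts output reduces naturally to one accuracy number per suite. Below is a minimal sketch, assuming `result` holds the return value of `metric.compute(...)` from the README examples above; `summarize_accuracies` is a hypothetical helper, not part of the metric's API:

```python
import numpy as np

# Hypothetical helper (illustration only): reduce the metric's dict-of-dicts
# output to one accuracy score per suite. An item counts as a "success" only
# if all of its boolean predictions are True.
def summarize_accuracies(result):
    return {
        suite_name: np.array(suite_results["prediction_results"]).all(axis=1).mean()
        for suite_name, suite_results in result.items()
    }

# Usage: list suites from hardest to easiest for the model.
# for name, acc in sorted(summarize_accuracies(result).items(), key=lambda kv: kv[1]):
#     print(f"{name}: {acc:.3f}")
```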
syntaxgym.py CHANGED

````diff
@@ -78,17 +78,25 @@ SUITE_DATASET_CONDITION_SPEC = {
 
 
 SUITE_DATASET_SPEC = {
+    "suite_name": datasets.Value("string"),
     "item_number": datasets.Value("int32"),
     "conditions": datasets.Sequence(SUITE_DATASET_CONDITION_SPEC),
     "predictions": datasets.Sequence(datasets.Value("string")),
 }
 
 
-class
+class SyntaxGymMetricSuiteResult(TypedDict):
+    """
+    Evaluation results for a single suite.
+    """
+    suite_name: str
     prediction_results: List[List[bool]]
     region_totals: List[Dict[Tuple[str, int], float]]
 
 
+SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
+
+
 def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
     """
     Load and prepare a tokenizer for SyntaxGym evaluation.
@@ -137,7 +145,7 @@ class SyntaxGym(evaluate.EvaluationModule):
     def _info(self):
         seq = datasets.Sequence
         features = datasets.Features({
-            "
+            "dataset": SUITE_DATASET_SPEC
         })
         return evaluate.EvaluationModuleInfo(
             module_type="metric",
@@ -149,7 +157,7 @@ class SyntaxGym(evaluate.EvaluationModule):
             codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
         )
 
-    def _compute(self,
+    def _compute(self, dataset, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"]
             if device == "gpu":
@@ -163,14 +171,18 @@ class SyntaxGym(evaluate.EvaluationModule):
 
         tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
-        results = {
+        results = {}
+        result_keys = ["prediction_results", "region_totals"]
         # TODO batch all items together
-        for item in datasets.logging.tqdm(
+        for item in datasets.logging.tqdm(dataset):
             result_single = self._compute_single(item, tokenizer, tokenizer_kwargs,
                                                  model, device)
 
-
-
+            suite_name = item["suite_name"]
+            if suite_name not in results:
+                results[suite_name] = {k: [] for k in result_keys}
+            for k in result_keys:
+                results[suite_name][k].append(result_single[k])
 
         return results
 
````
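The regrouping step added to `_compute` is a plain group-by over the new `suite_name` field. Here is a standalone sketch of the same pattern on invented toy data, using `collections.defaultdict` in place of the explicit membership check; the items and values are made up for illustration:

```python
from collections import defaultdict

# Toy stand-ins for dataset rows and their per-item results; in the module
# above, the per-item values come from _compute_single.
result_keys = ["prediction_results", "region_totals"]
toy_items = [
    {"suite_name": "subordination_src-src",
     "prediction_results": [True],
     "region_totals": {("sub_no-matrix", 1): 14.9}},
    {"suite_name": "subordination_src-src",
     "prediction_results": [False],
     "region_totals": {("sub_no-matrix", 1): 15.2}},
]

# Group per-item results into one {key: list} dict per suite.
results = defaultdict(lambda: {k: [] for k in result_keys})
for item in toy_items:
    for k in result_keys:
        results[item["suite_name"]][k].append(item[k])

assert results["subordination_src-src"]["prediction_results"] == [[True], [False]]
```

Either form works; `defaultdict` just removes the `if suite_name not in results` branch.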
test.py CHANGED

````diff
@@ -495,20 +495,22 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
     implementation, using the same underlying `gpt2` model.
     """
 
-
-
+    suite_name = "subordination_src-src"
+    dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+    result = syntaxgym_metric.compute(dataset=dataset["test"], model_id="gpt2")
 
+    region_totals = result[suite_name]["region_totals"]
     from pprint import pprint
-    pprint(
+    pprint(region_totals[0])
     pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])
 
-    keys =
+    keys = region_totals[0].keys()
     assert set(keys) == set(GPT2_SUBORDINATION_SRC_REFERENCE[0].keys())
 
-    result_ndarray = np.concatenate([np.array([
-    for
-    reference_ndarray = np.concatenate([np.array([
-    for
+    result_ndarray = np.concatenate([np.array([region_totals_i[key] for key in keys])
+                                     for region_totals_i in region_totals])
+    reference_ndarray = np.concatenate([np.array([region_totals_i[key] for key in keys])
+                                        for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
     pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                   key=lambda x: -x[1]))
     np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
````
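The test's comparison idiom is worth spelling out: flatten two parallel lists of `{(condition, region): surprisal}` dicts into vectors that share a fixed key order, then compare them within a tolerance. A minimal sketch with invented values (not the reference data):

```python
import numpy as np

# Two parallel lists of {(condition, region): surprisal} dicts, as in the test.
keys = [("sub_no-matrix", 1), ("sub_no-matrix", 2)]
observed = [{("sub_no-matrix", 1): 14.9056, ("sub_no-matrix", 2): 39.0631}]
reference = [{("sub_no-matrix", 1): 14.9057, ("sub_no-matrix", 2): 39.0629}]

# Flatten each list into one vector, reading every dict in the same key order
# so positions align between observed and reference.
observed_vec = np.concatenate([np.array([d[k] for k in keys]) for d in observed])
reference_vec = np.concatenate([np.array([d[k] for k in keys]) for d in reference])

# Passes: every absolute difference is below the tolerance.
np.testing.assert_allclose(observed_vec, reference_vec, atol=1e-3)
```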