Update metric to support the new dataset representation, and to evaluate multiple suites in one call
Files changed:
- README.md (+24, -7)
- syntaxgym.py (+19, -7)
- test.py (+10, -8)
README.md CHANGED

````diff
@@ -27,23 +27,40 @@ The metric takes a SyntaxGym test suite as input, as well as the name of the mod
 ```python
 import datasets
 import evaluate
+import numpy as np
 
 dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
 metric = evaluate.load("cpllab/syntaxgym")
-result = metric.compute(
+result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracy = result["prediction_results"].all(axis=1).mean(axis=0)
+suite_accuracy = np.array(result["subordination_src-src"]["prediction_results"]).all(axis=1).mean(axis=0)
 ```
 
 ### Run the entire SyntaxGym dataset
 
-TODO
+You can load and evaluate all suites at once by omitting the dataset configuration name (second argument):
+
+```python
+import datasets
+import evaluate
+import numpy as np
+
+dataset = datasets.load_dataset("cpllab/syntaxgym")
+metric = evaluate.load("cpllab/syntaxgym")
+result = metric.compute(dataset=dataset["test"], model_id="gpt2")
+
+# Compute suite accuracy. Mean success over items, where "success" is the conjunction
+# of all boolean prediction results.
+suite_accuracies = [np.array(suite_results["prediction_results"]).all(axis=1).mean(axis=0)
+                    for suite_results in result.values()]
+overall_accuracy = np.mean(suite_accuracies)
+```
 
 ### Inputs
 
-- **
+- **dataset** (`Dataset`): SyntaxGym test suite, represented as a Huggingface dataset. See the [dataset reference][syntaxgym-dataset].
 - **model_id** (str): Model used to calculate probabilities of each word. (This is only well defined for causal language models. This includes models such as `gpt2`, causal variations of BERT, causal versions of T5, and more. The full list can be found in the [`AutoModelForCausalLM` documentation][causal].)
 - **batch_size** (int): Maximum batch size for computations
 - **add_start_token** (bool): whether to add the start token to each sentence. Defaults to `True`.
@@ -51,15 +68,15 @@ TODO
 
 ### Output Values
 
-The metric returns a dict
+The metric returns a dict of dicts, mapping test suite names to test suite performance. Each inner dict has two entries:
 
 - **prediction_results** (`List[List[bool]]`): For each item in the test suite, a list of booleans indicating whether each corresponding prediction came out `True`. Typically these are combined to yield an accuracy score (see example usage above).
 - **region_totals** (`List[Dict[Tuple[str, int], float]]`): For each item, a mapping from individual region (keys `(<condition_name>, <region_number>)`) to the float-valued total surprisal for tokens in this region. This is useful for visualization, or if you'd like to use the aggregate surprisal data for other tasks (e.g. reading time prediction or neural activity prediction).
 
 ```python
->>> print(result["prediction_results"][0])
+>>> print(result["subordination_src-src"]["prediction_results"][0])
 [True]
->>> print(result["region_totals"][0])
+>>> print(result["subordination_src-src"]["region_totals"][0])
 {('sub_no-matrix', 1): 14.905603408813477,
  ('sub_no-matrix', 2): 39.063140869140625,
  ('sub_no-matrix', 3): 26.862628936767578,
````
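For downstream analysis, the new dict-of-dicts output reduces naturally to one accuracy number per suite. Below is a minimal sketch, assuming `result` holds the return value of `metric.compute(...)` from the README examples above; `summarize_accuracies` is a hypothetical helper, not part of the metric's API:

```python
import numpy as np

# Hypothetical helper (illustration only): reduce the metric's dict-of-dicts
# output to one accuracy score per suite. An item counts as a "success" only
# if all of its boolean predictions are True.
def summarize_accuracies(result):
    return {
        suite_name: np.array(suite_results["prediction_results"]).all(axis=1).mean()
        for suite_name, suite_results in result.items()
    }

# Usage: list suites from hardest to easiest for the model.
# for name, acc in sorted(summarize_accuracies(result).items(), key=lambda kv: kv[1]):
#     print(f"{name}: {acc:.3f}")
```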
syntaxgym.py CHANGED

````diff
@@ -78,17 +78,25 @@ SUITE_DATASET_CONDITION_SPEC = {
 
 
 SUITE_DATASET_SPEC = {
+    "suite_name": datasets.Value("string"),
     "item_number": datasets.Value("int32"),
     "conditions": datasets.Sequence(SUITE_DATASET_CONDITION_SPEC),
     "predictions": datasets.Sequence(datasets.Value("string")),
 }
 
 
-class
+class SyntaxGymMetricSuiteResult(TypedDict):
+    """
+    Evaluation results for a single suite.
+    """
+    suite_name: str
     prediction_results: List[List[bool]]
     region_totals: List[Dict[Tuple[str, int], float]]
 
 
+SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
+
+
 def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
     """
     Load and prepare a tokenizer for SyntaxGym evaluation.
@@ -137,7 +145,7 @@ class SyntaxGym(evaluate.EvaluationModule):
     def _info(self):
         seq = datasets.Sequence
         features = datasets.Features({
-            "
+            "dataset": SUITE_DATASET_SPEC
         })
         return evaluate.EvaluationModuleInfo(
             module_type="metric",
@@ -149,7 +157,7 @@ class SyntaxGym(evaluate.EvaluationModule):
             codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
         )
 
-    def _compute(self,
+    def _compute(self, dataset, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"]
             if device == "gpu":
@@ -163,14 +171,18 @@ class SyntaxGym(evaluate.EvaluationModule):
 
         tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
-        results = {
+        results = {}
+        result_keys = ["prediction_results", "region_totals"]
         # TODO batch all items together
-        for item in datasets.logging.tqdm(
+        for item in datasets.logging.tqdm(dataset):
             result_single = self._compute_single(item, tokenizer, tokenizer_kwargs,
                                                  model, device)
 
-
-
+            suite_name = item["suite_name"]
+            if suite_name not in results:
+                results[suite_name] = {k: [] for k in result_keys}
+            for k in result_keys:
+                results[suite_name][k].append(result_single[k])
 
         return results
 
````
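The regrouping step added to `_compute` is a plain group-by over the new `suite_name` field. Here is a standalone sketch of the same pattern on invented toy data, using `collections.defaultdict` in place of the explicit membership check; the items and values are made up for illustration:

```python
from collections import defaultdict

# Toy stand-ins for dataset rows and their per-item results; in the module
# above, the per-item values come from _compute_single.
result_keys = ["prediction_results", "region_totals"]
toy_items = [
    {"suite_name": "subordination_src-src",
     "prediction_results": [True],
     "region_totals": {("sub_no-matrix", 1): 14.9}},
    {"suite_name": "subordination_src-src",
     "prediction_results": [False],
     "region_totals": {("sub_no-matrix", 1): 15.2}},
]

# Group per-item results into one {key: list} dict per suite.
results = defaultdict(lambda: {k: [] for k in result_keys})
for item in toy_items:
    for k in result_keys:
        results[item["suite_name"]][k].append(item[k])

assert results["subordination_src-src"]["prediction_results"] == [[True], [False]]
```

Either form works; `defaultdict` just removes the `if suite_name not in results` branch.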
test.py CHANGED

````diff
@@ -495,20 +495,22 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
     implementation, using the same underlying `gpt2` model.
     """
 
-
-
+    suite_name = "subordination_src-src"
+    dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+    result = syntaxgym_metric.compute(dataset=dataset["test"], model_id="gpt2")
 
+    region_totals = result[suite_name]["region_totals"]
     from pprint import pprint
-    pprint(
+    pprint(region_totals[0])
     pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])
 
-    keys =
+    keys = region_totals[0].keys()
     assert set(keys) == set(GPT2_SUBORDINATION_SRC_REFERENCE[0].keys())
 
-    result_ndarray = np.concatenate([np.array([
-    for
-    reference_ndarray = np.concatenate([np.array([
-    for
+    result_ndarray = np.concatenate([np.array([region_totals_i[key] for key in keys])
+                                     for region_totals_i in region_totals])
+    reference_ndarray = np.concatenate([np.array([region_totals_i[key] for key in keys])
+                                        for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
     pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                   key=lambda x: -x[1]))
     np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
````
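The test's comparison idiom is worth spelling out: flatten two parallel lists of `{(condition, region): surprisal}` dicts into vectors that share a fixed key order, then compare them within a tolerance. A minimal sketch with invented values (not the reference data):

```python
import numpy as np

# Two parallel lists of {(condition, region): surprisal} dicts, as in the test.
keys = [("sub_no-matrix", 1), ("sub_no-matrix", 2)]
observed = [{("sub_no-matrix", 1): 14.9056, ("sub_no-matrix", 2): 39.0631}]
reference = [{("sub_no-matrix", 1): 14.9057, ("sub_no-matrix", 2): 39.0629}]

# Flatten each list into one vector, reading every dict in the same key order
# so positions align between observed and reference.
observed_vec = np.concatenate([np.array([d[k] for k in keys]) for d in observed])
reference_vec = np.concatenate([np.array([d[k] for k in keys]) for d in reference])

# Passes: every absolute difference is below the tolerance.
np.testing.assert_allclose(observed_vec, reference_vec, atol=1e-3)
```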