jgauthier committed on
Commit 4bd2962
1 Parent(s): 3427db2

add tokenizer config from perplexity metric. truncation breaks tests
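For reference on the "truncation breaks tests" note, a minimal sketch of what the new truncation settings do to over-long inputs. This is illustrative only: the "gpt2" checkpoint, the manual pad-token assignment, and max_length=1024 are assumptions, not values from this commit (which derives the limit from model.config.max_length inside prepare_tokenizer).

    from transformers import AutoTokenizer

    # Illustrative checkpoint; the metric itself resolves the tokenizer from
    # model.name_or_path inside prepare_tokenizer().
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

    conditions = ["a very long condition sentence " * 300, "a short condition"]
    enc = tokenizer(conditions,
                    add_special_tokens=False,
                    padding=True,
                    truncation=True,
                    max_length=1024,  # assumed context size, for illustration
                    return_tensors="pt",
                    return_offsets_mapping=True)

    # The long condition is clipped to max_length tokens and the short one is
    # padded up to the same length; clipped tokens also disappear from the
    # offset mapping that the metric uses to assign tokens to regions.
    print(enc["input_ids"].shape)       # torch.Size([2, 1024])
    print(enc["offset_mapping"].shape)  # torch.Size([2, 1024, 2])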

Files changed (1)
  1. syntaxgym.py +49 -11
syntaxgym.py CHANGED
@@ -21,7 +21,7 @@ import datasets
 import evaluate
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
 
 from .prediction import Prediction
 
@@ -89,6 +89,46 @@ class SyntaxGymMetricResult(TypedDict):
     region_totals: List[Dict[Tuple[str, int], float]]
 
 
+def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
+    """
+    Load and prepare a tokenizer for SyntaxGym evaluation.
+
+    Returns:
+        tokenizer:
+        tokenizer_kwargs: suggested kwargs for any tokenizer calls
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
+
+    # if batch_size > 1 (which generally leads to padding being required), and
+    # if there is not an already assigned pad_token, assign an existing
+    # special token to also be the padding token
+    if tokenizer.pad_token is None and batch_size > 1:
+        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
+        # check that the model already has at least one special token defined
+        assert (
+            len(existing_special_tokens) > 0
+        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
+        # assign one of the special tokens to also be the pad token
+        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+
+    if add_start_token:
+        # leave room for <BOS> token to be added:
+        assert (
+            tokenizer.bos_token is not None
+        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+        max_tokenized_len = model.config.max_length - 1
+    else:
+        max_tokenized_len = model.config.max_length
+
+    tokenizer_kwargs = {
+        "add_special_tokens": False,
+        "padding": True,
+        "truncation": True,
+        "max_length": max_tokenized_len
+    }
+    return tokenizer, tokenizer_kwargs
+
+
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class SyntaxGym(evaluate.EvaluationModule):
     """
@@ -110,7 +150,7 @@ class SyntaxGym(evaluate.EvaluationModule):
             codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
         )
 
-    def _compute(self, suite, model_id, device=None) -> SyntaxGymMetricResult:
+    def _compute(self, suite, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"]
             if device == "gpu":
@@ -122,31 +162,31 @@ class SyntaxGym(evaluate.EvaluationModule):
         model = model.to(device)
         model.eval()
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        # TODO copy from perplexity metric
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
         results = {"prediction_results": [], "region_totals": []}
         # TODO batch all items together
         for item in datasets.logging.tqdm(suite):
-            result_single = self._compute_single(item, tokenizer, model, device)
+            result_single = self._compute_single(item, tokenizer, tokenizer_kwargs,
+                                                 model, device)
 
             for k in ["prediction_results", "region_totals"]:
                 results[k].append(result_single[k])
 
         return results
 
-    def _compute_single(self, item, tokenizer, model, device):
+    def _compute_single(self, item, tokenizer, tokenizer_kwargs, model, device):
         tokenized = tokenizer(item["conditions"]["content"],
-                              padding=True,
                               return_tensors="pt",
-                              return_offsets_mapping=True).to(device)
+                              return_offsets_mapping=True,
+                              **tokenizer_kwargs).to(device)
 
         # input_ids: B * T
         input_ids = tokenized["input_ids"]
         assert input_ids.ndim == 2
 
         # Compute sentence level surprisals.
+        # TODO support sentences which exceed truncation length
         with torch.no_grad():
             # Pre-softmax predictive distribution B * T * V
             logits = model(input_ids).logits
@@ -164,8 +204,6 @@
         # reindexed surprisals: B * (T - 1)
         surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)) \
             .squeeze(2)
-        # This is the original, which works but not with multiple axes in expected_ids
-        # surprisals = surps_shifted[range(surps_shifted.shape[0]), expected_ids]
 
         # surprisals is now B * (T - 1)
 
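With this change, _compute accepts batch_size and add_start_token alongside model_id and device. A usage sketch under stated assumptions: the Hub id cpllab/syntaxgym for the metric module and the suite dataset, and the suite name subordination_src-src, are illustrative guesses rather than values taken from this diff, and the suite= keyword simply mirrors the _compute signature above.

    import datasets
    import evaluate

    # Assumed Hub ids -- adjust to wherever the metric module and the SyntaxGym
    # suites are actually published.
    syntaxgym = evaluate.load("cpllab/syntaxgym")
    suite = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src", split="test")

    result = syntaxgym.compute(
        suite=suite,
        model_id="gpt2",        # any causal LM checkpoint on the Hub
        batch_size=8,           # here only affects pad-token setup; items are still scored one at a time
        add_start_token=False,  # True requires a BOS token and reserves one position in max_length
        device="cpu",
    )

    # One entry per suite item; see SyntaxGymMetricResult for the exact types.
    print(result["prediction_results"][0])
    print(result["region_totals"][0])   # {(condition, region_number): summed surprisal}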