Jon Gauthier committed
Commit 0d58633
1 Parent(s): dadceff

fall back to GPT2TokenizerFast for models which don't have a fast tokenizer (like OPT)

Files changed (1)
  1. syntaxgym.py +14 -1
syntaxgym.py CHANGED
@@ -14,16 +14,21 @@
 """TODO: Add a description here."""
 
 from collections import defaultdict
+import logging
 from typing import List, Dict, Tuple, NamedTuple
 
 import datasets
 import evaluate
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, \
+    PreTrainedTokenizer, PreTrainedTokenizerFast, \
+    GPT2TokenizerFast
 
 from .prediction import Prediction
 
+L = logging.getLogger(__name__)
+
 
 _CITATION = """\
 @inproceedings{Hu:et-al:2020,
@@ -108,7 +113,15 @@ def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrain
         tokenizer:
         tokenizer_kwargs: suggested kwargs for any tokenizer calls
     """
+
     tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        # We need a fast tokenizer because these are the only tokenizers that support
+        # return_offsets_mapping. Try to use GPT2 tokenizer -- this is sufficient for
+        # OPT.
+        L.warning(f"The model {model.name_or_path} does not have a fast tokenizer, "
+                  f"which is required for this metric. Running with GPT2 tokenizer.")
+        tokenizer = GPT2TokenizerFast.from_pretrained(model.name_or_path)
 
     # if batch_size > 1 (which generally leads to padding being required), and
     # if there is not an already assigned pad_token, assign an existing
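
For context, a minimal standalone sketch of the fallback this commit introduces. The checkpoint name facebook/opt-125m and the example sentence are illustrative only; newer transformers releases may already ship a fast OPT tokenizer, in which case the isinstance check passes and the fallback never runs.

from transformers import AutoTokenizer, GPT2TokenizerFast, PreTrainedTokenizerFast

# Illustrative checkpoint that, at the time of this commit, had no fast tokenizer.
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if not isinstance(tokenizer, PreTrainedTokenizerFast):
    # OPT uses GPT-2-style BPE vocab/merges files, so GPT2TokenizerFast can be
    # loaded from the same repository as a drop-in fast replacement.
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

# Only fast tokenizers support return_offsets_mapping, which the metric relies on
# to align SyntaxGym regions with model tokens.
enc = tokenizer("The keys to the cabinet are on the table.", return_offsets_mapping=True)
print(enc["offset_mapping"][:5])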