xgen-7b-8k-base model tokenizer has a problem: AttributeError: 'XgenTokenizer' object has no attribute 'encoder'

#29
by awesomenes

AttributeError                            Traceback (most recent call last)
Cell In[17], line 3
      1 pretrained_model_name = "./models/xgen-7b-8k-base"
      2 # model = AutoModelForCausalLM.from_pretrained(pretrained_model_name, torch_dtype=torch.bfloat16)
----> 3 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)
      4 # model

File ~/miniconda3/envs/fine-tuning/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:738, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    736     if os.path.isdir(pretrained_model_name_or_path):
    737         tokenizer_class.register_for_auto_class()
--> 738     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    739 elif config_tokenizer_class is not None:
    740     tokenizer_class = None

File ~/miniconda3/envs/fine-tuning/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2045, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
   2042     else:
   2043         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2045 return cls._from_pretrained(
   2046     resolved_vocab_files,
   2047     pretrained_model_name_or_path,
   2048     init_configuration,
   2049     *init_inputs,
   2050     token=token,
   2051     cache_dir=cache_dir,
   2052     local_files_only=local_files_only,
   2053     _commit_hash=commit_hash,
   2054     _is_local=is_local,
   2055     **kwargs,
   2056 )

File ~/miniconda3/envs/fine-tuning/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2256, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
   2254 # Instantiate the tokenizer.
   2255 try:
-> 2256     tokenizer = cls(*init_inputs, **init_kwargs)
   2257 except OSError:
   2258     raise OSError(
   2259         "Unable to load vocabulary from file. "
   2260         "Please check that the provided vocabulary is accessible and not corrupted."
   2261     )

File ~/.cache/huggingface/modules/transformers_modules/xgen-7b-8k-base/tokenization_xgen.py:137, in XgenTokenizer.__init__(self, pad_token, eos_token, add_eos_token, add_special_tokens, **kwargs)
    135 pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
    136 eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
--> 137 super().__init__(
    138     pad_token=pad_token_added,
    139     eos_token=eos_token_added,
    140     add_eos_token=add_eos_token,
    141     add_special_tokens=add_special_tokens,
    142     **kwargs,
    143 )
    144 self.add_eos_token = add_eos_token
    145 self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)

File ~/miniconda3/envs/fine-tuning/lib/python3.10/site-packages/transformers/tokenization_utils.py:366, in PreTrainedTokenizer.__init__(self, **kwargs)
    362 self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
    364 # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
    365 # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following tokenizers
--> 366 self._add_tokens(self.all_special_tokens_extended, special_tokens=True)
    368 self._decode_use_source_tokenizer = False

File ~/miniconda3/envs/fine-tuning/lib/python3.10/site-packages/transformers/tokenization_utils.py:462, in PreTrainedTokenizer._add_tokens(self, new_tokens, special_tokens)
    460 if new_tokens is None:
    461     return added_tokens
--> 462 current_vocab = self.get_vocab().copy()
    463 new_idx = len(current_vocab)  # only call this once, len gives the last index + 1
    464 for token in new_tokens:

File ~/.cache/huggingface/modules/transformers_modules/xgen-7b-8k-base/tokenization_xgen.py:154, in XgenTokenizer.get_vocab(self)
    152 def get_vocab(self):
    153     """Returns vocab as a dict"""
--> 154     vocab = {self.encoder.decode_single_token_bytes(i): i for i in range(self.vocab_size)}
    155     return vocab

File ~/.cache/huggingface/modules/transformers_modules/xgen-7b-8k-base/tokenization_xgen.py:150, in XgenTokenizer.vocab_size(self)
    147 @property
    148 def vocab_size(self):
    149     """Returns vocab size"""
--> 150     return self.encoder.n_vocab

AttributeError: 'XgenTokenizer' object has no attribute 'encoder'
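
For anyone reading the trace: this looks like an initialization-ordering problem. Newer transformers releases call self._add_tokens(...) from inside PreTrainedTokenizer.__init__, which routes through XgenTokenizer.get_vocab and vocab_size, but the remote tokenization_xgen.py only assigns self.encoder after super().__init__() returns, so the attribute does not exist yet when it is first needed. A minimal sketch of the pattern, with stand-in classes rather than the actual library code:

class Base:  # stand-in for PreTrainedTokenizer in newer transformers
    def __init__(self):
        # Base-class init now inspects the vocab during construction...
        current_vocab = self.get_vocab().copy()

class Child(Base):  # stand-in for XgenTokenizer
    def __init__(self):
        super().__init__()  # get_vocab() runs in here...
        self.encoder = {}   # ...but encoder is only assigned afterwards

    def get_vocab(self):
        # Fails: the attribute does not exist yet
        return dict(self.encoder)

Child()  # raises AttributeError, mirroring the failure above

Assigning the encoder before calling super().__init__(), or pinning a transformers version whose base __init__ does not touch the vocab, avoids the error.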

Maybe the installed version of transformers is too new; downgrading with "pip install transformers==4.30.0" can work around the problem.
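
If you try the downgrade, it may be worth confirming that the pinned version is the one actually in use before retrying (the local model path here is just the one from the traceback above):

import transformers
print(transformers.__version__)  # expect 4.30.0 after downgrading

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "./models/xgen-7b-8k-base", trust_remote_code=True
)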

cc @ArthurZ. I think this is indeed fixed in the latest transformers:

pip install -U transformers

Hi everyone, please clear the HF cache and retry.
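
In case it is unclear which cache is meant: the cached copy of the remote code lives under the transformers_modules directory shown in the traceback. A sketch of removing just that entry, assuming the default cache location:

import shutil
from pathlib import Path

# Delete the cached remote-code module so the updated tokenization_xgen.py
# is fetched again on the next from_pretrained call.
modules_dir = Path.home() / ".cache/huggingface/modules/transformers_modules/xgen-7b-8k-base"
if modules_dir.exists():
    shutil.rmtree(modules_dir)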
