Loading the tokenizer fails due to an infinite loop
#4
by
ghbacct
- opened
Hi,
When running the tokenizer loading code as per the documentation, I run into a recursion error.
RecursionError Traceback (most recent call last)
/tmp/ipykernel_510781/3877109626.py in ()
2 from transformers import AutoTokenizer, AutoModelForCausalLM
3
----> 4 tokenizer = AutoTokenizer.from_pretrained("ChanceFocus/finma-7b-nlp")
5 model = AutoModelForCausalLM.from_pretrained("ChanceFocus/finma-7b-nlp")
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
689 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
690 )
--> 691 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
692
693 # Otherwise we have to be creative.
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1823 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
1824
-> 1825 return cls._from_pretrained(
1826 resolved_vocab_files,
1827 pretrained_model_name_or_path,
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
1986 # Instantiate tokenizer.
1987 try:
-> 1988 tokenizer = cls(*init_inputs, **init_kwargs)
1989 except OSError:
1990 raise OSError(
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/models/llama/tokenization_llama_fast.py in __init__(self, vocab_file, tokenizer_file, clean_up_tokenization_spaces, unk_token, bos_token, eos_token, add_bos_token, add_eos_token, **kwargs)
102 self._add_bos_token = add_bos_token
103 self._add_eos_token = add_eos_token
--> 104 self.update_post_processor()
105
106 self.vocab_file = vocab_file
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/models/llama/tokenization_llama_fast.py in update_post_processor(self)
109 def update_post_processor(self):
110 bos = self.bos_token
--> 111 bos_token_id = self.bos_token_id
112
113 eos = self.eos_token
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in bos_token_id(self)
1134 if self._bos_token is None:
1135 return None
-> 1136 return self.convert_tokens_to_ids(self.bos_token)
1137
1138 @property
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)
248
249 if isinstance(tokens, str):
--> 250 return self._convert_token_to_id_with_added_voc(tokens)
251
252 return [self._convert_token_to_id_with_added_voc(token) for token in tokens]
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in _convert_token_to_id_with_added_voc(self, token)
255 index = self._tokenizer.token_to_id(token)
256 if index is None:
--> 257 return self.unk_token_id
258 return index
259
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in unk_token_id(self)
1153 if self._unk_token is None:
1154 return None
-> 1155 return self.convert_tokens_to_ids(self.unk_token)
1156
1157 @property
... last 3 frames repeated, from the frame below ...
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)
248
249 if isinstance(tokens, str):
--> 250 return self._convert_token_to_id_with_added_voc(tokens)
251
252 return [self._convert_token_to_id_with_added_voc(token) for token in tokens]
RecursionError: maximum recursion depth exceeded while getting the str of an object
Please help.
Hi,
Firstly, I am grateful for your interest in our work.
You can follow the guidelines provided on the model card to load the model. Here's the Python code for your reference:
from transformers import LlamaTokenizer, LlamaForCausalLM
tokenizer = LlamaTokenizer.from_pretrained('ChanceFocus/finma-7b-nlp')
model = LlamaForCausalLM.from_pretrained('ChanceFocus/finma-7b-nlp', device_map='auto')
If executing this code doesn't resolve your issue, please don't hesitate to comment!
jiminHuang
changed discussion status to
closed