Text Generation
Transformers
PyTorch
English
llama
finance
llms
text-generation-inference

Loading the tokenizer fails due to infinite loop

#4
by ghbacct - opened

Hi,

When running the tokenizer-loading code as described in the documentation, I run into a recursion error.

RecursionError                            Traceback (most recent call last)
/tmp/ipykernel_510781/3877109626.py in <module>()
      2 from transformers import AutoTokenizer, AutoModelForCausalLM
      3 
----> 4 tokenizer = AutoTokenizer.from_pretrained("ChanceFocus/finma-7b-nlp")
      5 model = AutoModelForCausalLM.from_pretrained("ChanceFocus/finma-7b-nlp")
~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    689                     f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
    690                 )
--> 691             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    692 
    693         # Otherwise we have to be creative.

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
   1823                 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
   1824 
-> 1825         return cls._from_pretrained(
   1826             resolved_vocab_files,
   1827             pretrained_model_name_or_path,

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
   1986         # Instantiate tokenizer.
   1987         try:
-> 1988             tokenizer = cls(*init_inputs, **init_kwargs)
   1989         except OSError:
   1990             raise OSError(

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/models/llama/tokenization_llama_fast.py in __init__(self, vocab_file, tokenizer_file, clean_up_tokenization_spaces, unk_token, bos_token, eos_token, add_bos_token, add_eos_token, **kwargs)
    102         self._add_bos_token = add_bos_token
    103         self._add_eos_token = add_eos_token
--> 104         self.update_post_processor()
    105 
    106         self.vocab_file = vocab_file

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/models/llama/tokenization_llama_fast.py in update_post_processor(self)
    109     def update_post_processor(self):
    110         bos = self.bos_token
--> 111         bos_token_id = self.bos_token_id
    112 
    113         eos = self.eos_token

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in bos_token_id(self)
   1134         if self._bos_token is None:
   1135             return None
-> 1136         return self.convert_tokens_to_ids(self.bos_token)
   1137 
   1138     @property

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)
    248 
    249         if isinstance(tokens, str):
--> 250             return self._convert_token_to_id_with_added_voc(tokens)
    251 
    252         return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in _convert_token_to_id_with_added_voc(self, token)
    255         index = self._tokenizer.token_to_id(token)
    256         if index is None:
--> 257             return self.unk_token_id
    258         return index
    259 

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in unk_token_id(self)
   1153         if self._unk_token is None:
   1154             return None
-> 1155         return self.convert_tokens_to_ids(self.unk_token)
   1156 
   1157     @property

... last 3 frames repeated, from the frame below ...

~/Dev/finma-experiments/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)
    248 
    249         if isinstance(tokens, str):
--> 250             return self._convert_token_to_id_with_added_voc(tokens)
    251 
    252         return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

RecursionError: maximum recursion depth exceeded while getting the str of an object

Please help.

ChanceFocus Asset Management (Shanghai) Company org

Hi,

Firstly, I am grateful for your interest in our work.

You can follow the guidelines provided on the model card to load the model. Here's the Python code for your reference:

from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained('ChanceFocus/finma-7b-nlp')
model = LlamaForCausalLM.from_pretrained('ChanceFocus/finma-7b-nlp', device_map='auto')

If executing this code doesn't resolve your issue, please don't hesitate to comment!

jiminHuang changed discussion status to closed

Sign up or log in to comment