tokenizer = AutoTokenizer.from_pretrained("ehartford/dolphin-2.1-mistral-7b") results in an unk error related to tokens greater than 32000

#13
opened by LaferriereJC

ValueError Traceback (most recent call last)
Cell In[57], line 1
----> 1 tokenizer = AutoTokenizer.from_pretrained("ehartford/dolphin-2.1-mistral-7b")

File H:\py310-venv\lib\site-packages\transformers\models\auto\tokenization_auto.py:694, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
690 if tokenizer_class is None:
691 raise ValueError(
692 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
693 )
--> 694 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
696 # Otherwise we have to be creative.
697 # if model is an encoder decoder, the encoder tokenizer class is used by default
698 if isinstance(config, EncoderDecoderConfig):

File H:\py310-venv\lib\site-packages\transformers\tokenization_utils_base.py:1812, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1809 else:
1810 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1812 return cls._from_pretrained(
1813 resolved_vocab_files,
1814 pretrained_model_name_or_path,
1815 init_configuration,
1816 *init_inputs,
1817 use_auth_token=use_auth_token,
1818 cache_dir=cache_dir,
1819 local_files_only=local_files_only,
1820 _commit_hash=commit_hash,
1821 _is_local=is_local,
1822 **kwargs,
1823 )

File H:\py310-venv\lib\site-packages\transformers\tokenization_utils_base.py:1844, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
1842 has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
1843 if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
-> 1844 slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
1845 copy.deepcopy(resolved_vocab_files),
1846 pretrained_model_name_or_path,
1847 copy.deepcopy(init_configuration),
1848 *init_inputs,
1849 use_auth_token=use_auth_token,
1850 cache_dir=cache_dir,
1851 local_files_only=local_files_only,
1852 _commit_hash=_commit_hash,
1853 **(copy.deepcopy(kwargs)),
1854 )
1855 else:
1856 slow_tokenizer = None

File H:\py310-venv\lib\site-packages\transformers\tokenization_utils_base.py:2031, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
2024 raise ValueError(
2025 f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
2026 f"{index}."
2027 )
2028 elif not has_tokenizer_file and index != current_index:
2029 # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
2030 # current length of the tokenizer.
-> 2031 raise ValueError(
2032 f"Non-consecutive added token '{token}' found. "
2033 f"Should have index {current_index} but has index {index} in saved vocabulary."
2034 )
2036 is_special = bool(token in special_tokens)
2037 if is_last_special is None or is_last_special == is_special:

ValueError: Non-consecutive added token '<unk>' found. Should have index 32000 but has index 0 in saved vocabulary.
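
For context on what is failing: the slow tokenizer path replays added_tokens.json on top of the 32000-entry base SentencePiece vocabulary and expects every saved index to match the running vocabulary length. A simplified sketch of that check follows (paraphrased from the transformers frames above, not the exact implementation; the added_tokens contents are a guess at what the repo ships, not a verified copy):

```python
# Simplified sketch of the consistency check that raises above (not the exact
# transformers code). The added_tokens dict is an assumed snapshot of
# added_tokens.json for this repo, not a verified copy.
base_vocab_size = 32000  # Mistral's SentencePiece vocabulary
added_tokens = {
    "<unk>": 0,            # already in the base vocab at index 0
    "<s>": 1,
    "</s>": 2,
    "<|im_end|>": 32000,   # the genuinely new ChatML tokens
    "<|im_start|>": 32001,
}

current_index = base_vocab_size
for token, index in sorted(added_tokens.items(), key=lambda item: item[1]):
    if index != current_index:
        raise ValueError(
            f"Non-consecutive added token '{token}' found. "
            f"Should have index {current_index} but has index {index} in saved vocabulary."
        )
    current_index += 1
```

The loop fails on the very first entry, because <unk>, <s>, and </s> already sit in the base vocabulary below index 32000.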

I get this same error when trying to deploy on Hugging Face Inference Endpoints.
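
Two things may help (a sketch under assumptions, not a verified fix). Upgrading transformers may be enough on its own, since the added-token handling was reworked in later releases. Failing that, one local workaround is to download only the tokenizer files, drop the added_tokens.json entries that duplicate the base vocabulary, and load from the local copy; the >= 32000 filter and the local directory name below are assumptions, not something taken from the repo:

```python
# Workaround sketch: keep only the added tokens that actually extend the
# 32000-entry base vocabulary, then load the tokenizer from a local copy.
# Assumes the root cause is added_tokens.json re-declaring <unk>/<s>/</s>;
# upgrading transformers first may make this unnecessary.
import json
import os

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

local_dir = snapshot_download(
    "ehartford/dolphin-2.1-mistral-7b",
    allow_patterns=["*.json", "*.model"],  # tokenizer/config files only, no weights
    local_dir="dolphin-2.1-tokenizer",     # hypothetical local folder name
)

added_tokens_path = os.path.join(local_dir, "added_tokens.json")
if os.path.exists(added_tokens_path):
    with open(added_tokens_path) as f:
        added_tokens = json.load(f)
    # Drop entries that point back into the base vocabulary (index < 32000).
    added_tokens = {tok: idx for tok, idx in added_tokens.items() if idx >= 32000}
    with open(added_tokens_path, "w") as f:
        json.dump(added_tokens, f, indent=2)

tokenizer = AutoTokenizer.from_pretrained(local_dir)
```

On Inference Endpoints, where patching cached files by hand is awkward, pinning a newer transformers version in the environment is probably the more practical route.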
