Error " Couldn't instantiate the backend tokenizer from one of ... "

#9
by lucashw - opened

Hi,

I'm trying to run the command below, but I keep running into an error.

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")

I also tried cloning the entire repo and loading the tokenizer files manually from the local copy, but the same error occurs. Can anyone please help troubleshoot? Does it require a specific "transformers" version? Mine is 4.34.1.
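
For reference, the local load I attempted looked roughly like this (the ./bge-reranker-base path is just wherever I cloned the repo), and it fails with the same traceback:

from transformers import AutoTokenizer

# Point from_pretrained at the local clone instead of the Hub repo id.
tokenizer = AutoTokenizer.from_pretrained("./bge-reranker-base")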

Here is the error.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[61], line 2
      1 from transformers import AutoTokenizer
----> 2 tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")

File ~/miniconda3/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:751, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    747     if tokenizer_class is None:
    748         raise ValueError(
    749             f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
    750         )
--> 751     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    753 # Otherwise we have to be creative.
    754 # if model is an encoder decoder, the encoder tokenizer class is used by default
    755 if isinstance(config, EncoderDecoderConfig):

File ~/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2017, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
   2014     else:
   2015         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2017 return cls._from_pretrained(
   2018     resolved_vocab_files,
   2019     pretrained_model_name_or_path,
   2020     init_configuration,
   2021     *init_inputs,
   2022     token=token,
   2023     cache_dir=cache_dir,
   2024     local_files_only=local_files_only,
   2025     _commit_hash=commit_hash,
   2026     _is_local=is_local,
   2027     **kwargs,
   2028 )

File ~/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2249, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
   2247 # Instantiate the tokenizer.
   2248 try:
-> 2249     tokenizer = cls(*init_inputs, **init_kwargs)
   2250 except OSError:
   2251     raise OSError(
   2252         "Unable to load vocabulary from file. "
   2253         "Please check that the provided vocabulary is accessible and not corrupted."
   2254     )

File ~/miniconda3/lib/python3.11/site-packages/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py:155, in XLMRobertaTokenizerFast.__init__(self, vocab_file, tokenizer_file, bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token, **kwargs)
    139 def __init__(
    140     self,
    141     vocab_file=None,
   (...)
    151 ):
    152     # Mask token behave like a normal word, i.e. include the space before it
    153     mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
--> 155     super().__init__(
    156         vocab_file,
    157         tokenizer_file=tokenizer_file,
    158         bos_token=bos_token,
    159         eos_token=eos_token,
    160         sep_token=sep_token,
    161         cls_token=cls_token,
    162         unk_token=unk_token,
    163         pad_token=pad_token,
    164         mask_token=mask_token,
    165         **kwargs,
    166     )
    168     self.vocab_file = vocab_file

File ~/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py:120, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
    118     fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
    119 else:
--> 120     raise ValueError(
    121         "Couldn't instantiate the backend tokenizer from one of: \n"
    122         "(1) a `tokenizers` library serialization file, \n"
    123         "(2) a slow tokenizer instance to convert or \n"
    124         "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
    125         "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
    126     )
    128 self._tokenizer = fast_tokenizer
    130 if slow_tokenizer is not None:

ValueError: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
Beijing Academy of Artificial Intelligence org

Hi, we have not encountered this error, and transformers==4.34.1 also works fine on our side.
Based on the error message, you can try installing the sentencepiece package and running the command again.
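
A minimal sketch of that suggestion, assuming a pip-based environment (install sentencepiece into the same environment your notebook kernel uses, then restart the kernel before retrying):

# In a terminal, or in a notebook cell prefixed with "!":
#   pip install sentencepiece

# Then, in a fresh Python session / restarted kernel:
import sentencepiece  # should import cleanly once installed
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
print(type(tokenizer))  # expected to be XLMRobertaTokenizerFast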

I did install the latest version of sentencepiece, and it didn't help. I'm at a total loss now ...
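
In case it helps narrow things down, here is a quick check I can run in the same kernel to confirm which interpreter it uses and which packages it actually sees (a common gotcha is installing sentencepiece into a different environment, or not restarting the kernel afterwards):

import sys
print(sys.executable)  # pip must install into this exact interpreter

try:
    import sentencepiece
    print("sentencepiece", sentencepiece.__version__)
except ImportError:
    print("sentencepiece is NOT visible to this interpreter")

import tokenizers
import transformers
print("tokenizers", tokenizers.__version__)
print("transformers", transformers.__version__)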
