Problems with tokenizer

#3
by do-me - opened

I tried to run this model on my CUDA GPU, but I get the following error when running the sample code. Do you have any ideas why?

from transformers import AutoTokenizer, AutoModelWithLMHead, TranslationPipeline

pipeline = TranslationPipeline(
    model=AutoModelWithLMHead.from_pretrained("SEBIS/legal_t5_small_trans_it_en_small_finetuned"),
    tokenizer=AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path="SEBIS/legal_t5_small_trans_it_en",
        do_lower_case=False,
        skip_special_tokens=True,
    ),
    device=0,
)

it_text = "Supplenti presenti al momento della votazione finale"

pipeline([it_text], max_length=512)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_2819/3211885403.py in <module>
      3 pipeline = TranslationPipeline(
      4 model=AutoModelWithLMHead.from_pretrained("SEBIS/legal_t5_small_trans_it_en_small_finetuned"),
----> 5 tokenizer=AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "SEBIS/legal_t5_small_trans_it_en", do_lower_case=False, 
      6                                             skip_special_tokens=True),
      7     device=0

~/.local/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    657             tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
    658             if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 659                 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    660             else:
    661                 if tokenizer_class_py is not None:

~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
   1799                 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
   1800 
-> 1801         return cls._from_pretrained(
   1802             resolved_vocab_files,
   1803             pretrained_model_name_or_path,

~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
   1954         # Instantiate tokenizer.
   1955         try:
-> 1956             tokenizer = cls(*init_inputs, **init_kwargs)
   1957         except OSError:
   1958             raise OSError(

~/.local/lib/python3.8/site-packages/transformers/models/t5/tokenization_t5_fast.py in __init__(self, vocab_file, tokenizer_file, eos_token, unk_token, pad_token, extra_ids, additional_special_tokens, **kwargs)
    131                 )
    132 
--> 133         super().__init__(
    134             vocab_file,
    135             tokenizer_file=tokenizer_file,

~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in __init__(self, *args, **kwargs)
    112         elif slow_tokenizer is not None:
    113             # We need to convert a slow tokenizer to build the backend
--> 114             fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
    115         elif self.slow_tokenizer_class is not None:
    116             # We need to create and convert a slow tokenizer to build the backend

~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py in convert_slow_tokenizer(transformer_tokenizer)
   1160     converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
   1161 
-> 1162     return converter_class(transformer_tokenizer).converted()

~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py in __init__(self, *args)
    436         super().__init__(*args)
    437 
--> 438         from .utils import sentencepiece_model_pb2 as model_pb2
    439 
    440         m = model_pb2.ModelProto()

~/.local/lib/python3.8/site-packages/transformers/utils/sentencepiece_model_pb2.py in <module>
     90     create_key=_descriptor._internal_create_key,
     91     values=[
---> 92         _descriptor.EnumValueDescriptor(
     93             name="UNIGRAM",
     94             index=0,

~/.local/lib/python3.8/site-packages/google/protobuf/descriptor.py in __new__(cls, name, index, number, type, options, serialized_options, create_key)
    794                 type=None,  # pylint: disable=redefined-builtin
    795                 options=None, serialized_options=None, create_key=None):
--> 796       _message.Message._CheckCalledFromGeneratedFile()
    797       # There is no way we can build a complete EnumValueDescriptor with the
    798       # given parameters (the name of the Enum is not known, for example).

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates
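For what it's worth, here is a minimal sketch of the two workarounds the error message itself suggests, plus a third guess based on the traceback. The protobuf pin 3.20.3 is my assumption (any 3.20.x or lower should match the advice above), and the use_fast=False fallback is speculation on my part, since the failure happens while converting the slow tokenizer to a fast one:

# Option 1 (shell): downgrade protobuf, per the error message.
# The exact pin 3.20.3 is an assumption; any 3.20.x or lower should do.
#   pip install "protobuf==3.20.3"

# Option 2: force the pure-Python protobuf parser (slower), per the error
# message. This must be set before protobuf is first imported, i.e. before
# importing transformers.
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from transformers import AutoTokenizer

# Option 3 (guess): the traceback fails inside convert_slow_tokenizer, so
# loading the slow tokenizer directly may sidestep the protobuf import.
tokenizer = AutoTokenizer.from_pretrained(
    "SEBIS/legal_t5_small_trans_it_en",
    do_lower_case=False,
    use_fast=False,
)

Note that the environment variable only takes effect if it is set before protobuf is imported, so in a notebook that has already hit this error a kernel restart is likely needed first.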
