model mrm8488/t5-base-finetuned-wikiSQL: AutoTokenizer.from_pretrained() error message

#5
by ahe61 - opened

Hi,

I run into an error when loading your model:

! pip install transformers
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

Error message
TypeError: Couldn't build proto file into descriptor pool: duplicate file name (sentencepiece_model.proto)

(plus this warning, printed three times:
The xla_device argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your config.json file.)

Loading works fine with standard BERT models (or with no model at all).
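For comparison, the same call with a plain BERT checkpoint loads without errors on this setup (bert-base-uncased is just one example here):

from transformers import AutoTokenizer

# Loads fine in the same environment; the failure seems specific to the T5/sentencepiece path.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")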

I am running Python 3.10.
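In case it helps narrow things down, my guess (unconfirmed) is that the sentencepiece_model_pb2.py bundled with transformers collides with protobuf 4.x when the fast tokenizer is built from the slow one. Below is a rough sketch of two workarounds I have seen suggested for this kind of protobuf clash, not a verified fix: forcing the pure-Python protobuf implementation (or pinning protobuf below 4.x), and loading the slow tokenizer with use_fast=False (which needs the sentencepiece package installed).

! pip install sentencepiece "protobuf<4"  # pinning protobuf below 4.x is one commonly suggested workaround

import os
# Assumption: the duplicate-registration error comes from the C++ protobuf backend.
# If used, this must be set before transformers (and thus protobuf) is imported.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from transformers import AutoModelWithLMHead, AutoTokenizer

# use_fast=False loads the slow T5 tokenizer via sentencepiece, which should avoid
# the slow->fast conversion step that imports sentencepiece_model_pb2.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL", use_fast=False)
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")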

best,

Andreas

Full traceback:
Cell In[4], line 3
1 from transformers import AutoModelWithLMHead, AutoTokenizer
----> 3 tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
4 #model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

File ~/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:659, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
657 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
658 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 659 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
660 else:
661 if tokenizer_class_py is not None:

File ~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1801, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1798 else:
1799 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1801 return cls._from_pretrained(
1802 resolved_vocab_files,
1803 pretrained_model_name_or_path,
1804 init_configuration,
1805 *init_inputs,
1806 use_auth_token=use_auth_token,
1807 cache_dir=cache_dir,
1808 local_files_only=local_files_only,
1809 _commit_hash=commit_hash,
1810 **kwargs,
1811 )

File ~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1956, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
1954 # Instantiate tokenizer.
1955 try:
-> 1956 tokenizer = cls(*init_inputs, **init_kwargs)
1957 except OSError:
1958 raise OSError(
1959 "Unable to load vocabulary from file. "
1960 "Please check that the provided vocabulary is accessible and not corrupted."
1961 )

File ~/.local/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:133, in T5TokenizerFast.__init__(self, vocab_file, tokenizer_file, eos_token, unk_token, pad_token, extra_ids, additional_special_tokens, **kwargs)
126 if extra_tokens != extra_ids:
127 raise ValueError(
128 f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
129 " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
130 " tokens"
131 )
--> 133 super().__init__(
134 vocab_file,
135 tokenizer_file=tokenizer_file,
136 eos_token=eos_token,
137 unk_token=unk_token,
138 pad_token=pad_token,
139 extra_ids=extra_ids,
140 additional_special_tokens=additional_special_tokens,
141 **kwargs,
142 )
144 self.vocab_file = vocab_file
145 self.can_save_slow_tokenizer = False if not self.vocab_file else True

File ~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py:114, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
111 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
112 elif slow_tokenizer is not None:
113 # We need to convert a slow tokenizer to build the backend
--> 114 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
115 elif self.slow_tokenizer_class is not None:
116 # We need to create and convert a slow tokenizer to build the backend
117 slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)

File ~/.local/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.py:1162, in convert_slow_tokenizer(transformer_tokenizer)
1154 raise ValueError(
1155 f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance."
1156 " No converter was found. Currently available slow->fast convertors:"
1157 f" {list(SLOW_TO_FAST_CONVERTERS.keys())}"
1158 )
1160 converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
-> 1162 return converter_class(transformer_tokenizer).converted()

File ~/.local/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.py:438, in SpmConverter.__init__(self, *args)
434 requires_backends(self, "protobuf")
436 super().__init__(*args)
--> 438 from .utils import sentencepiece_model_pb2 as model_pb2
440 m = model_pb2.ModelProto()
441 with open(self.original_tokenizer.vocab_file, "rb") as f:

File ~/.local/lib/python3.10/site-packages/transformers/utils/sentencepiece_model_pb2.py:29
24 # @@protoc_insertion_point(imports)
26 _sym_db = _symbol_database.Default()
---> 29 DESCRIPTOR = _descriptor.FileDescriptor(
30 name="sentencepiece_model.proto",
31 package="sentencepiece",
32 syntax="proto2",
33 serialized_options=b"H\003",
34 create_key=_descriptor._internal_create_key,
35 serialized_pb=(
36 b'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01'
37 b" \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02"
38 b" \x01(\t\x12\x41\n\nmodel_type\x18\x03"
39 b" \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04"
40 b" \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12"
41 b' \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n'
42 b" \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b"
43 b" \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12"
44 b' \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r'
45 b" \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e"
46 b" \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f"
47 b" \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12"
48 b" \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10"
49 b" \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11"
50 b" \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14"
51 b" \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15"
52 b" \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17"
53 b" \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16"
54 b" \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18"
55 b" \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19"
56 b" \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e"
57 b" \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$"
58 b" \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18"
59 b' \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18"'
60 b" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18)"
61 b" \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+"
62 b" \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18."
63 b" \x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30"
64 b" \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87"
65 b" \x12+\n\x1ctrain_extremely_large_corpus\x18\x31"
66 b' \x01(\x08:\x05\x66\x61lse"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01'
67 b" \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03"
68 b" \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12"
69 b" \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06"
70 b' \x01(\t
\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01'
71 b' \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01'
72 b" \x01(\t\x12\x10\n\x08\x65xpected\x18\x02"
73 b' \x01(\t\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01'
74 b" \x03(\x0b\x32'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02"
75 b" \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03"
76 b" \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04"
77 b" \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05"
78 b" \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01"
79 b" \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03"
80 b' \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05
\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
81 ),
82 )
85 _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
86 name="ModelType",
87 full_name="sentencepiece.TrainerSpec.ModelType",
(...)
128 serialized_end=1347,
129 )
130 _sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)

File ~/.local/lib/python3.10/site-packages/google/protobuf/descriptor.py:1028, in FileDescriptor.__new__(cls, name, package, options, serialized_options, serialized_pb, dependencies, public_dependencies, syntax, pool, create_key)
1026 raise RuntimeError('Please link in cpp generated lib for %s' % (name))
1027 elif serialized_pb:
-> 1028 return _message.default_pool.AddSerializedFile(serialized_pb)
1029 else:
1030 return super(FileDescriptor, cls).__new__(cls)

TypeError: Couldn't build proto file into descriptor pool: duplicate file name (sentencepiece_model.proto)
