Problem with Tokenizer

#2
by ikmeet - opened

Can someone kindly help me with this tokenizer error? Here is my code and the full traceback:
1 from transformers import AutoModelForCausalLM, AutoTokenizer
----> 3 tokenizer = AutoTokenizer.from_pretrained("LingoIITGN/ganga-1b")
4 model = AutoModelForCausalLM.from_pretrained("LingoIITGN/ganga-1b", device_map="auto")
6 input_text = "BCCI ने टी-20 वर्ल्ड कप के बीच जिम्बाब्वे सीरीज "

File ~/.local/lib/python3.9/site-packages/transformers/models/auto/tokenization_auto.py:768, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
764 if tokenizer_class is None:
765 raise ValueError(
766 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
767 )
--> 768 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
770 # Otherwise we have to be creative.
771 # if model is an encoder decoder, the encoder tokenizer class is used by default
772 if isinstance(config, EncoderDecoderConfig):

File ~/.local/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:2024, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
2021 else:
2022 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2024 return cls._from_pretrained(
2025 resolved_vocab_files,
2026 pretrained_model_name_or_path,
2027 init_configuration,
2028 *init_inputs,
2029 token=token,
2030 cache_dir=cache_dir,
2031 local_files_only=local_files_only,
2032 _commit_hash=commit_hash,
2033 _is_local=is_local,
2034 **kwargs,
2035 )

File ~/.local/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:2256, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
2254 # Instantiate the tokenizer.
2255 try:
-> 2256 tokenizer = cls(*init_inputs, **init_kwargs)
2257 except OSError:
2258 raise OSError(
2259 "Unable to load vocabulary from file. "
2260 "Please check that the provided vocabulary is accessible and not corrupted."
2261 )

File ~/.local/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py:111, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
108 fast_tokenizer = copy.deepcopy(tokenizer_object)
109 elif fast_tokenizer_file is not None and not from_slow:
110 # We have a serialization from tokenizers which let us directly build the backend
--> 111 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
112 elif slow_tokenizer is not None:
113 # We need to convert a slow tokenizer to build the backend
114 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)

Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3
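
This exception is raised by the Rust backend of the tokenizers package while parsing the repo's tokenizer.json: an older tokenizers build does not recognize the pre_tokenizer variant serialized there, so deserialization fails with this untagged-enum error. A quick way to confirm is to check the installed versions; a minimal sketch:

```python
import tokenizers
import transformers

# Old tokenizers builds cannot deserialize newer pre_tokenizer variants
# in tokenizer.json, which produces exactly this untagged-enum error.
print("transformers:", transformers.__version__)
print("tokenizers:", tokenizers.__version__)
```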

ikmeet changed discussion status to closed

Update transformers (pip install -U transformers tokenizers); the outdated tokenizers build is what fails to parse this repo's tokenizer.json.
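
For completeness, a minimal sketch of the full flow after upgrading (device_map="auto" additionally requires the accelerate package; the generation parameters are illustrative, not taken from the model card):

```python
# After upgrading:  pip install -U transformers tokenizers accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer

# The original snippet should now load cleanly.
tokenizer = AutoTokenizer.from_pretrained("LingoIITGN/ganga-1b")
model = AutoModelForCausalLM.from_pretrained("LingoIITGN/ganga-1b", device_map="auto")

input_text = "BCCI ने टी-20 वर्ल्ड कप के बीच जिम्बाब्वे सीरीज "
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
# max_new_tokens=50 is an arbitrary illustrative choice.
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```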
