from transformers.dynamic_module_utils import get_class_from_dynamic_module
from transformers.tokenization_utils import AddedToken

# Fetch the CodeGen2.5 tokenizer class and its tiktoken helper from the
# remote code hosted alongside the Salesforce/codegen25-7b-multi checkpoint.
CodeGen25Tokenizer = get_class_from_dynamic_module(
    "tokenization_codegen25.CodeGen25Tokenizer", "Salesforce/codegen25-7b-multi"
)
tiktoken_tokenizer = get_class_from_dynamic_module(
    "tokenization_codegen25.tiktoken_tokenizer", "Salesforce/codegen25-7b-multi"
)


class DeciCoderTokenizer(CodeGen25Tokenizer):
    def __init__(
        self,
        pad_token=None,
        eos_token="<|endoftext|>",
        add_eos_token=False,
        add_special_tokens=True,
        **kwargs,
    ):
        self.add_eos_token = add_eos_token
        # Build the underlying tiktoken encoding before the parent constructor
        # runs, since the parent may query it (e.g. for the vocabulary size).
        self.encoder = tiktoken_tokenizer(
            base="gpt2", pad_token=pad_token, add_special=add_special_tokens
        )
        # Wrap string special tokens in AddedToken so surrounding whitespace
        # is preserved rather than stripped.
        pad_token_added = (
            AddedToken(pad_token, lstrip=False, rstrip=False)
            if isinstance(pad_token, str)
            else pad_token
        )
        eos_token_added = (
            AddedToken(eos_token, lstrip=False, rstrip=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        super().__init__(
            pad_token=pad_token_added,
            eos_token=eos_token_added,
            add_eos_token=add_eos_token,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    def _convert_id_to_token(self, index):
        # Some ids in the padded vocabulary range map to no token; return None
        # instead of raising so downstream decoding can skip them. A narrowed
        # except clause replaces the original bare `except:`.
        try:
            return super()._convert_id_to_token(index)
        except Exception:
            return None
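

# A minimal usage sketch, assuming network access to the Hugging Face Hub, a
# transformers version that still resolves the remote CodeGen2.5 module, and
# an installed `tiktoken` package (the remote code depends on it). The sample
# string is illustrative only.
if __name__ == "__main__":
    tokenizer = DeciCoderTokenizer(eos_token="<|endoftext|>", add_eos_token=True)
    ids = tokenizer.encode("def fibonacci(n):")
    print(ids)
    print(tokenizer.decode(ids))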