# Provenance (Hugging Face Hub file-viewer header, preserved as a comment):
#   Repository: Deci / DeciCoder-6B  (Text Generation, Transformers, Safetensors, custom_code)
#   File:       DeciCoder-6B / tokenization_decicoder.py
#   Uploaded:   NajeebDeci — "tokenizer files", commit e084f01, 1.52 kB
from transformers.models.auto.tokenization_auto import get_class_from_dynamic_module
from transformers.tokenization_utils import AddedToken
CodeGen25Tokenizer = get_class_from_dynamic_module("tokenization_codegen25.CodeGen25Tokenizer",
"Salesforce/codegen25-7b-multi")
tiktoken_tokenizer = get_class_from_dynamic_module("tokenization_codegen25.tiktoken_tokenizer",
"Salesforce/codegen25-7b-multi")
class DeciCoderTokenizer(CodeGen25Tokenizer):
def __init__(
self,
pad_token=None,
eos_token="<|endoftext|>",
add_eos_token=False,
add_special_tokens=True,
**kwargs,
):
self.add_eos_token = add_eos_token
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
super().__init__(
pad_token=pad_token_added,
eos_token=eos_token_added,
add_eos_token=add_eos_token,
add_special_tokens=add_special_tokens,
**kwargs,
)
def _convert_id_to_token(self, index):
try:
return super()._convert_id_to_token(index)
except:
return None