# codeX-1.0 / tokenizer / train_tokenizer.py
# dorkai's picture
# Upload model from GitHub.
# b410583
import os

from tokenizers import ByteLevelBPETokenizer
# Train a byte-level BPE tokenizer on the code and documentation corpora,
# save the resulting model files, and print a sample encoding.

# Training corpora; relative paths are resolved against the current
# working directory.
paths = ['train_code.txt', 'train_doc.txt']

# Directory that receives the trained model files.
output_dir = "./salesforce"
# save_model() expects the target directory to already exist. Create it
# before training so that a missing folder cannot throw away the result of
# the whole training run at the very last step.
os.makedirs(output_dir, exist_ok=True)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=32000,
    min_frequency=3,
    special_tokens=[
        "<pad>",
        "<s>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk; "codet5" is passed as the file-name prefix.
tokenizer.save_model(output_dir, "codet5")

# Smoke check: print the tokens produced for a sentence that mixes special
# tokens, an apostrophe and an emoji.
print(
    tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)