# Tokenizer patch script for dolphin-2.6-mistral-7b.
"""Remap the dolphin-2.6-mistral tokenizer's EOS: token id 2 becomes the
ChatML end-of-turn marker "<|im_end|>" instead of Mistral's default "</s>".

Why the rewrite: the previous version mutated the dict returned by
``tokenizer.get_vocab()`` (which is a copy — mutation is a no-op) and then
passed ``vocab=...`` back into ``AutoTokenizer.from_pretrained`` (which does
not accept a replacement vocabulary), so only the ``eos_token``/``pad_token``
assignments ever took effect.  The reliable approach for a fast tokenizer is
to save it and then patch the serialized ``tokenizer.json`` on disk.
"""
import json
from pathlib import Path

from transformers import AutoTokenizer

SRC = "/workspace/dolphin-2.6-mistral-7b-hf"
DST = "/workspace/dolphin-new-tokenizer/"

tokenizer = AutoTokenizer.from_pretrained(SRC)

# Point the special-token attributes at the ChatML end marker so that
# tokenizer_config.json / special_tokens_map.json are written with it.
tokenizer.eos_token = "<|im_end|>"
tokenizer.pad_token = "<|im_end|>"

# Save first, then patch the serialized fast-tokenizer file in place.
tokenizer.save_pretrained(DST)

tok_file = Path(DST) / "tokenizer.json"
if tok_file.exists():  # tokenizer.json is only emitted for fast (Rust) tokenizers
    data = json.loads(tok_file.read_text(encoding="utf-8"))

    # 1. Remap the base-vocab entry: "</s>" keeps its id (2 for Mistral)
    #    but is renamed to "<|im_end|>".
    # NOTE(review): assumes "<|im_end|>" is not already a distinct entry in
    # the base vocab — if the checkpoint added it as a separate token, this
    # would create two strings for one id; verify against the checkpoint.
    vocab = data["model"]["vocab"]
    if "</s>" in vocab:
        vocab["<|im_end|>"] = vocab.pop("</s>")

    # 2. Keep the added-token registry consistent with the rename so the
    #    special token still round-trips through encode/decode.
    for entry in data.get("added_tokens", []):
        if entry.get("content") == "</s>":
            entry["content"] = "<|im_end|>"

    tok_file.write_text(
        json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"
    )