dolphin-2.6-mistral-7b / configs /modify-tokenizer.py
ehartford's picture
Upload folder using huggingface_hub
4b6b09c
raw
history blame contribute delete
No virus
517 Bytes
from transformers import AutoTokenizer
# Load the existing tokenizer so its token -> id mapping can be edited.
tokenizer = AutoTokenizer.from_pretrained(
    "/workspace/dolphin-2.6-mistral-7b-hf"
)

# Drop "</s>" from the mapping and assign id 2 to "<|im_end|>".
# `pop` without a default raises KeyError when "</s>" is absent.
vocab = tokenizer.get_vocab()
vocab.pop("</s>")
vocab.update({"<|im_end|>": 2})

# Reload the tokenizer, handing it the edited mapping.
# NOTE(review): confirm that `from_pretrained` actually honors the `vocab`
# keyword for this tokenizer class; otherwise the edit above has no effect.
tokenizer = AutoTokenizer.from_pretrained(
    "/workspace/dolphin-2.6-mistral-7b-hf", vocab=vocab
)

# Use "<|im_end|>" as both the end-of-sequence and the padding token.
tokenizer.eos_token = tokenizer.pad_token = "<|im_end|>"

# Write the modified tokenizer out to its own directory.
tokenizer.save_pretrained("/workspace/dolphin-new-tokenizer/")