tokenizer-arena / vocab /gpt_nexo_20b /convert_vocab_to_txt.py
xu-song's picture
update
751936e
raw
history blame
459 Bytes
import json
from tokenizers import Tokenizer
# Dump the GPT-NeoX-20B tokenizer vocabulary to a human-readable text file:
# one line per token, sorted ascending by token id. Each line holds a JSON
# record plus the raw and decoded token, tab-separated.
tokenizer = Tokenizer.from_file("20B_tokenizer.json")

vocab = tokenizer.get_vocab()  # dict: token string -> token id
sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])  # order by id

# Context manager guarantees the handle is flushed and closed even if an
# exception interrupts the loop (the original never closed the file).
with open("20B_tokenizer.txt", "w", encoding="utf-8") as f_out:
    for token, idx in sorted_vocab:
        # decode() renders the id through the tokenizer's decoder, which can
        # differ from the raw vocab key (e.g. byte-level markers -> spaces).
        decoded_token = tokenizer.decode([idx])
        # NOTE(review): tokens containing "\t" or "\n" would break this
        # line-oriented layout; format kept as-is for output compatibility.
        f_out.write(
            json.dumps({"id": idx, "token": token, "token_decode": decoded_token})
            + "\t" + token + "\t" + decoded_token + "\n"
        )