# Dump a Hugging Face tokenizer's vocabulary (GPT-NeoX-20B style) to
# 20B_tokenizer.txt, one JSON record per token id.
import json

from transformers import AutoTokenizer

# Load the tokenizer from the local "tokenizer" directory.
# trust_remote_code=True is needed when the tokenizer ships a custom class.
tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)

# get_vocab() maps token string -> integer id; sort entries by id so the
# dump is in stable, ascending-id order.
vocab = tokenizer.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])

# Write one JSON object per line: the id, the raw vocab token, and the
# token decoded back to text (these differ for byte-level/BPE tokenizers).
# ensure_ascii=False keeps non-ASCII tokens human-readable in the file;
# the with-block guarantees the file is flushed and closed.
with open("20B_tokenizer.txt", "w", encoding="utf-8") as f_out:
    for token, idx in sorted_vocab:
        decoded_token = tokenizer.decode([idx])
        record = {"id": idx, "token": token, "token_decode": decoded_token}
        f_out.write(json.dumps(record, ensure_ascii=False) + "\n")