import json
from transformers import AutoTokenizer

# Dump the tokenizer's full vocabulary to a JSON-lines file, one entry per
# token id, ordered by id. Each line records the raw vocab token string and
# its decoded (human-readable) form.
tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)

vocab = tokenizer.get_vocab()

# get_vocab() returns {token: id}; sort by id so output lines run 0..vocab_size-1.
sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])

# `with` guarantees the file is flushed and closed even if decode() raises;
# the original bare open() never closed the handle.
with open("20B_tokenizer.txt", "w", encoding="utf-8") as f_out:
    for token, idx in sorted_vocab:
        # decode() renders the id back to text (e.g. resolves byte-level
        # BPE escapes), which can differ from the raw vocab key.
        decoded_token = tokenizer.decode([idx])
        f_out.write(json.dumps(
            {"id": idx, "token": token, "token_decode": decoded_token},
            ensure_ascii=False) + "\n")