# Dump the GLM Chinese tokenizer vocabulary to a tab-separated text file,
# one "<id>\t<decoded token>" pair per line.
from tokenization import make_tokenizer

# Build the Chinese sentencepiece tokenizer from the local model file, with
# GLM's block-symbol and task-mask special tokens enabled.
add_sentinel_token = 0
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False, fix_command_token=False)


# Decode every token id and write "<id>\t<string>" lines; the with-block
# ensures the output file is closed even if decoding raises.
with open("glm_chinese.vocab.txt", "w", encoding="utf-8") as f_out:
    for idx in range(tokenizer.num_tokens):
        try:
            decode_str = tokenizer.DecodeIds([idx])
            f_out.write("%d\t%s\n" % (idx, decode_str))
        except Exception as e:
            # Some ids (e.g. special/command tokens) may fail to decode; log and skip.
            print(idx, e)