"""Find Chinese tokens in the GPT-NeoX-Chinese vocabulary that are missing
from the LLaMA vocabulary, and write them out for appending to it."""

from transformers import LlamaTokenizer

from utils.zh_util import is_chinese
from vocab.gpt_neox_chinese_v1 import tokenizer

# Reference vocabulary: the LLaMA tokenizer that is to be extended.
llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()

# Candidate vocabulary: the Chinese-augmented GPT-NeoX tokenizer.
vocab = tokenizer.get_vocab()

with open("append_zh.txt", "w", encoding="utf-8") as f_out:
    for token_id in vocab.values():
        # Decode the id instead of using the raw vocab entry so that
        # byte-level markers such as "Ġ" are resolved to plain text.
        token = tokenizer.decode([token_id])
        if len(token) < 1:
            continue
        # Keep tokens that start with a Chinese character and do not
        # already exist in the LLaMA vocabulary.
        if is_chinese(token[0]) and token not in llama_vocab:
            f_out.write(token + "\n")
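
# Note: `is_chinese` is a project-local helper from utils.zh_util. A minimal
# sketch of the check it presumably performs (assumption: a character counts
# as Chinese if it falls in the CJK Unified Ideographs block):
#
#     def is_chinese(char: str) -> bool:
#         return "\u4e00" <= char <= "\u9fff"
#
# The collected tokens in append_zh.txt can then be added to the LLaMA
# tokenizer, e.g. via the standard `add_tokens` API (a sketch, not the
# repository's own merge step):
#
#     llama_tokenizer = LlamaTokenizer.from_pretrained("../tokenizer")
#     new_tokens = open("append_zh.txt", encoding="utf-8").read().splitlines()
#     llama_tokenizer.add_tokens(new_tokens)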