from utils.zh_util import is_chinese
from transformers import LlamaTokenizer

from vocab.gpt_neox_chinese_v1 import tokenizer

# LLaMA vocabulary, used as the reference set to filter out tokens it already covers.
llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()

vocab = tokenizer.get_vocab()

# Collect Chinese tokens from the GPT-NeoX Chinese tokenizer that are missing
# from the LLaMA vocabulary, writing one token per line.
with open("append_zh.txt", "w", encoding="utf-8") as f_out:
    for token, token_id in vocab.items():
        # Use the decoded surface form rather than the raw vocab entry.
        token = tokenizer.decode([token_id])
        # token = token.strip("Ġ")
        if len(token) < 1:
            continue
        if is_chinese(token[0]) and token not in llama_vocab:
            f_out.write(token + "\n")
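
# Note: `utils.zh_util.is_chinese` is a project-local helper that is not shown here.
# A minimal sketch of what such a check might look like (assumption: it tests whether
# a character falls in the CJK Unified Ideographs range):
#
#     def is_chinese(ch: str) -> bool:
#         return "\u4e00" <= ch <= "\u9fff"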