# vocab/build_zh_vocab.py
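"""Aggregate Chinese tokens across several tokenizer vocabularies.

For each model listed below, reads `<model_name>_vocab.zh.jsonl`, collects the
tokens labeled as Chinese, and writes `vocab.freq.zh.txt`: one line per word
with the number of model vocabularies that contain it, most shared first.
"""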
import json
from collections import defaultdict

# For each Chinese word, count how many of the model vocabularies below contain it.
all_zh_words = defaultdict(int)

for model_name in [
"gpt2",
"gpt2_chinese",
"chinese_llama_lora_7b",
"bert_chinese",
"moss",
"bloom",
"bloomz_6b4_zh",
"gpt_nexo_20b",
"gpt_neox_chinese_v1",
"glm_chinese",
"chatglm"
]:
    zh_word_set = set()
    # Each line of <model_name>_vocab.zh.jsonl is a JSON object with at least
    # "token" and "type" fields; keep only tokens labeled as Chinese
    # ("中文单字" = single Chinese character, "中文多字" = multi-character Chinese word).
    with open(model_name + "_vocab.zh.jsonl", "r", encoding="utf-8") as f_in:
        for line in f_in:
            item = json.loads(line)
            token = item["token"]
            if item["type"] in ["中文单字", "中文多字"]:
                zh_word_set.add(token.strip())
    # A set is used above so each word is counted at most once per model.
    for word in zh_word_set:
        all_zh_words[word] += 1

# Sort by the number of model vocabularies containing each word, most shared first.
sorted_keywords = sorted(all_zh_words.items(), key=lambda kv: kv[1], reverse=True)

# Write one "<word>\t<count>" line per word.
with open("vocab.freq.zh.txt", "w", encoding="utf-8") as f_out:
    for word, count in sorted_keywords:
        f_out.write(f"{word}\t{count}\n")
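
# A minimal sanity check (a sketch, not part of the build itself): reload the
# output and print the ten words shared by the most model vocabularies.
#
# with open("vocab.freq.zh.txt", "r", encoding="utf-8") as f:
#     for line in list(f)[:10]:
#         word, count = line.rstrip("\n").split("\t")
#         print(word, count)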