"""Count, for each Chinese token, how many model vocabularies contain it.

For every model listed in ``MODEL_NAMES`` the script reads
``<model_name>_vocab.zh.jsonl`` from the current working directory (one JSON
object per line, each with a ``"token"`` and a ``"type"`` field), collects the
distinct whitespace-stripped tokens whose type marks them as Chinese, and
tallies in how many models each token occurs.  The tallies are written to
``vocab.freq.zh.txt`` as ``<token>\\t<count>`` lines, highest count first.
"""
import json
from collections import defaultdict

# Models whose vocabulary dumps are aggregated.
# NOTE(review): "gpt_nexo_20b" may be a typo for "gpt_neox_20b"; it is left
# untouched because it has to match the on-disk file name -- confirm.
MODEL_NAMES = [
    "gpt2",
    "gpt2_chinese",
    "chinese_llama_lora_7b",
    "bert_chinese",
    "moss",
    "bloom",
    "bloomz_6b4_zh",
    "gpt_nexo_20b",
    "gpt_neox_chinese_v1",
    "glm_chinese",
    "chatglm",
]

# "type" values that mark a token as Chinese
# (literally: "Chinese single character", "Chinese multiple characters").
ZH_TYPES = ("中文单字", "中文多字")

# token -> number of models whose vocabulary contains it
all_zh_words = defaultdict(int)

for model_name in MODEL_NAMES:
    # A set, so a token is counted at most once per model even when several
    # vocabulary entries collapse to the same string after strip().
    zh_word_set = set()
    # The context manager closes each input file deterministically instead of
    # leaving the handle to the garbage collector.
    with open(model_name + "_vocab.zh.jsonl", "r", encoding="utf-8") as f_in:
        for line in f_in:
            if not line.strip():
                # Tolerate blank / trailing empty lines, which json.loads
                # would otherwise reject.
                continue
            item = json.loads(line)
            token = item["token"]
            if item["type"] in ZH_TYPES:
                zh_word_set.add(token.strip())
    for word in zh_word_set:
        all_zh_words[word] += 1

# Highest count first.  Ties are broken by the token itself so the output file
# is reproducible: without a secondary key, the order of equal counts follows
# set iteration order, which varies between runs under hash randomization.
sorted_keywords = sorted(all_zh_words.items(), key=lambda kv: (-kv[1], kv[0]))

with open("vocab.freq.zh.txt", "w", encoding="utf-8") as f_out:
    for word, count in sorted_keywords:
        f_out.write(f"{word}\t{count}\n")