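# Count, for each Chinese word, how many tokenizer vocabularies contain it.
#
# Input (assumed format, inferred from the parsing code below): one
# <model_name>_vocab.zh.jsonl file per model, where each line is a JSON object
# such as {"token": "你好", "type": "中文多字"} -- the example token is
# illustrative. "中文单字" = single Chinese character token,
# "中文多字" = multi-character Chinese word token.
# Output: vocab.freq.zh.txt, one "word<TAB>count" line per word, sorted by count.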
import json
from collections import defaultdict

# word -> number of model vocabularies that contain it
all_zh_words = defaultdict(int)

for model_name in [
    "gpt2",
    "gpt2_chinese",
    "chinese_llama_lora_7b",
    "bert_chinese",
    "moss",
    "bloom",
    "bloomz_6b4_zh",
    "gpt_nexo_20b",
    "gpt_neox_chinese_v1",
    "glm_chinese",
    "chatglm"
]:
    # Collect the distinct Chinese tokens in this model's vocabulary.
    zh_word_set = set()
    with open(model_name + "_vocab.zh.jsonl", "r", encoding="utf-8") as f_in:
        for line in f_in:
            item = json.loads(line)
            token = item["token"]
            if item["type"] in ["中文单字", "中文多字"]:
                zh_word_set.add(token.strip())

    # Each vocabulary contributes at most once per word.
    for word in zh_word_set:
        all_zh_words[word] += 1

# Sort words by how many vocabularies contain them, most common first.
sorted_keywords = sorted(all_zh_words.items(), key=lambda kv: kv[1], reverse=True)

with open("vocab.freq.zh.txt", "w", encoding="utf-8") as f_out:
    for word, count in sorted_keywords:
        f_out.write("%s\t%d\n" % (word, count))