# Find Chinese words that the tokenizer splits into multiple tokens (word-level
# OOV) and turn them into candidate vocabulary additions.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")

def get_oov():
    """Write every dictionary word that encodes to more than one token to oov.txt."""
    with open("oov.txt", "w", encoding="utf-8") as f_out, \
            open("../../vocab.freq.zh.txt", "r", encoding="utf-8") as all_words:
        for line in all_words:
            word, _count = line.strip().split("\t")
            # Skip mis-encoded entries and a small manual blocklist of noisy words.
            if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
                continue

            # More than one token id means the word is OOV at the word level.
            if len(tokenizer.encode(word).ids) > 1:
                f_out.write(line)


def build_vocab():
    # Placeholder: not implemented in this script.
    pass
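
# A minimal sketch of what build_vocab() might eventually do, since the original
# above is an empty stub: register the collected words as added tokens and save a
# new tokenizer file. The function name and the output path are hypothetical, not
# from the original script; add_tokens() and save() are standard
# tokenizers.Tokenizer methods.
def build_vocab_sketch():
    with open("oov.add.txt", "r", encoding="utf-8") as f_in:
        new_tokens = [line.strip() for line in f_in if line.strip()]
    tokenizer.add_tokens(new_tokens)  # returns how many tokens were actually added
    tokenizer.save("../20B_tokenizer_chinese.extended.json")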



def convert_oov_to_merges():
    """Split words into merge groups; each merge must combine exactly two parts,
    for example:
    承担 -> 承 担
    天津市 -> 天津 市
    社会保障 -> 社会 保障
    的一部分 -> 的 一部分 -> 一 部分
    """
    with open("oov.txt", "r", encoding="utf-8") as f_in:
        all_tokens_and_counts = [line.strip().split("\t") for line in f_in]
    # Keep only tokens that occur at least 3 times in the frequency dictionary.
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]
    # Bucket the surviving tokens by character length.
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))

    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)


    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
            # f_out.write(token[0] + " " + token[1] + "\n")

        # Disabled experiment: split each 3-character token around a known
        # 2-character part. Note the two write branches are swapped relative to
        # the docstring (天津市 -> 天津 市 means a prefix match keeps the
        # 2-character part together); fixed here:
        # for token in len3:
        #     idx = -1
        #     for part in len2:
        #         if part in token:
        #             idx = token.find(part)
        #             break
        #     if idx == -1:
        #         print("not found", token)
        #     elif idx == 0:
        #         f_out.write(token[:2] + " " + token[2] + "\n")
        #     else:
        #         f_out.write(token[0] + " " + token[1:] + "\n")
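
# A minimal sketch (not part of the original script) of the pairwise rule the
# docstring above describes: BPE merges always combine exactly two parts, so a
# longer word must be decomposed recursively into binary merges. `known_parts`
# is a hypothetical set of strings that are already single tokens; the greedy
# shortest-prefix scan is just one possible strategy.
def split_into_merges(word, known_parts):
    """Return the (left, right) merges needed to build `word`, innermost first."""
    if len(word) <= 1 or word in known_parts:
        return []  # already a single token, nothing to merge
    # Take the first split where either side is already a known part, so that
    # 的一部分 becomes 的 + 一部分 and 天津市 becomes 天津 + 市.
    for cut in range(1, len(word)):
        left, right = word[:cut], word[cut:]
        if left in known_parts or right in known_parts:
            return (split_into_merges(left, known_parts)
                    + split_into_merges(right, known_parts)
                    + [(left, right)])
    # Fallback: peel off the first character, e.g. 承担 -> 承 + 担.
    left, right = word[:1], word[1:]
    return split_into_merges(right, known_parts) + [(left, right)]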





if __name__ == "__main__":
    get_oov()
    convert_oov_to_merges()