# Find Chinese words that the tokenizer splits into multiple tokens (word-level
# OOV) and turn them into candidate vocabulary additions.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")

def get_oov():
    """Write every dictionary word that encodes to more than one token to oov.txt."""
    with open("oov.txt", "w", encoding="utf-8") as f_out, \
            open("../../vocab.freq.zh.txt", "r", encoding="utf-8") as all_words:
        for line in all_words:
            word, _count = line.strip().split("\t")
            # Skip mis-encoded entries and a small manual blocklist of noisy words.
            if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
                continue

            # More than one token id means the word is OOV at the word level.
            if len(tokenizer.encode(word).ids) > 1:
                f_out.write(line)


def build_vocab():
    # Placeholder: not implemented in this script.
    pass
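
# A minimal sketch of what build_vocab() might eventually do, since the original
# above is an empty stub: register the collected words as added tokens and save a
# new tokenizer file. The function name and the output path are hypothetical, not
# from the original script; add_tokens() and save() are standard
# tokenizers.Tokenizer methods.
def build_vocab_sketch():
    with open("oov.add.txt", "r", encoding="utf-8") as f_in:
        new_tokens = [line.strip() for line in f_in if line.strip()]
    tokenizer.add_tokens(new_tokens)  # returns how many tokens were actually added
    tokenizer.save("../20B_tokenizer_chinese.extended.json")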



def convert_oov_to_merges():
    """Split words into merge groups; each merge must combine exactly two parts,
    for example:
    承担 -> 承 担
    天津市 -> 天津 市
    社会保障 -> 社会 保障
    的一部分 -> 的 一部分 -> 一 部分
    """
    with open("oov.txt", "r", encoding="utf-8") as f_in:
        all_tokens_and_counts = [line.strip().split("\t") for line in f_in]
    # Keep only tokens that occur at least 3 times in the frequency dictionary.
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]
    # Bucket the surviving tokens by character length.
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))

    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)


    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
            # f_out.write(token[0] + " " + token[1] + "\n")

        # Disabled experiment: split each 3-character token around a known
        # 2-character part. Note the two write branches are swapped relative to
        # the docstring (天津市 -> 天津 市 means a prefix match keeps the
        # 2-character part together); fixed here:
        # for token in len3:
        #     idx = -1
        #     for part in len2:
        #         if part in token:
        #             idx = token.find(part)
        #             break
        #     if idx == -1:
        #         print("not found", token)
        #     elif idx == 0:
        #         f_out.write(token[:2] + " " + token[2] + "\n")
        #     else:
        #         f_out.write(token[0] + " " + token[1:] + "\n")
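
# A minimal sketch (not part of the original script) of the pairwise rule the
# docstring above describes: BPE merges always combine exactly two parts, so a
# longer word must be decomposed recursively into binary merges. `known_parts`
# is a hypothetical set of strings that are already single tokens; the greedy
# shortest-prefix scan is just one possible strategy.
def split_into_merges(word, known_parts):
    """Return the (left, right) merges needed to build `word`, innermost first."""
    if len(word) <= 1 or word in known_parts:
        return []  # already a single token, nothing to merge
    # Take the first split where either side is already a known part, so that
    # 的一部分 becomes 的 + 一部分 and 天津市 becomes 天津 + 市.
    for cut in range(1, len(word)):
        left, right = word[:cut], word[cut:]
        if left in known_parts or right in known_parts:
            return (split_into_merges(left, known_parts)
                    + split_into_merges(right, known_parts)
                    + [(left, right)])
    # Fallback: peel off the first character, e.g. 承担 -> 承 + 担.
    left, right = word[:1], word[1:]
    return split_into_merges(right, known_parts) + [(left, right)]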





if __name__ == "__main__":
    get_oov()
    convert_oov_to_merges()