from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")


def get_oov():
    """Collect words that the tokenizer splits into more than one token (i.e. OOV words)."""
    with open("oov.txt", "w", encoding="utf-8") as f_out, \
            open("../../vocab.freq.zh.txt", "r", encoding="utf-8") as all_words:
        for line in all_words:
            word, count = line.strip().split("\t")
            # Skip words containing the replacement character and a few known noisy entries.
            if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
                continue
            encoding = tokenizer.encode(word)
            if len(encoding.ids) > 1:
                f_out.write(line)


def build_vocab():
    pass


def convert_oov_to_merges():
    """Split each word into merge groups; each merge must be a pair, e.g.
        承担 -> 承 担
        天津市 -> 天津 市
        社会保障 -> 社会 保障
        的一部分 -> 的 一部分 -> 一 部分
    """
    with open("oov.txt", "r", encoding="utf-8") as f_in:
        all_tokens_and_counts = [line.strip().split("\t") for line in f_in]
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]  # keep tokens that occur at least 3 times
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))
    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        # Write all single-character tokens and the first 20000 two-character tokens.
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
            # f_out.write(token[0] + " " + token[1] + "\n")
        # for token in len3:
        #     idx = -1
        #     for part in len2:
        #         if part in token:
        #             idx = token.find(part)
        #             break
        #     if idx == -1:
        #         print("not found", token)
        #     elif idx == 0:
        #         f_out.write(token[0] + " " + token[1:] + "\n")
        #     else:
        #         f_out.write(token[:2] + " " + token[2] + "\n")


get_oov()
convert_oov_to_merges()
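

# --- Illustrative follow-up (not invoked above): a minimal sketch of one way the tokens
# collected in oov.add.txt could be folded back into the tokenizer. This is an assumption,
# not part of the original pipeline: it uses Tokenizer.add_tokens(), which registers the
# strings as whole added tokens rather than as BPE merges, and the function name and
# output path "20B_tokenizer_chinese.added.json" are hypothetical.
def add_oov_tokens_to_tokenizer(tokenizer_path="../20B_tokenizer_chinese.json",
                                oov_path="oov.add.txt",
                                out_path="20B_tokenizer_chinese.added.json"):
    tok = Tokenizer.from_file(tokenizer_path)
    with open(oov_path, "r", encoding="utf-8") as f:
        tokens = [line.strip() for line in f if line.strip()]
    added = tok.add_tokens(tokens)  # returns the number of tokens actually added
    print("added", added, "tokens")
    tok.save(out_path)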