from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
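# Pipeline: get_oov() scans a Chinese word-frequency list and records every word that the
# tokenizer splits into more than one token; convert_oov_to_merges() then turns the
# collected words into vocabulary additions written to oov.add.txt.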
def get_oov():
    """Collect words from the frequency list that the tokenizer splits into more than one token."""
    with open("oov.txt", "w", encoding="utf-8") as f_out, \
            open("../../vocab.freq.zh.txt", "r", encoding="utf-8") as all_words:
        for line in all_words:
            word, count = line.strip().split("\t")
            # Skip words containing the replacement character and a few noisy entries.
            if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
                continue
            encoding = tokenizer.encode(word)
            if len(encoding.ids) > 1:  # more than one token id => OOV for this vocab
                f_out.write(line)
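# A minimal helper capturing the same OOV criterion (hypothetical, not used by the script):
# a word is OOV for this vocab exactly when encoding it needs more than one token id.
def is_oov(word):
    return len(tokenizer.encode(word).ids) > 1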
def build_vocab():
    pass
def convert_oov_to_merges():
    """Split each word into merge groups; every group must be a pair, e.g.
    承担 -> 承 担
    天津市 -> 天津 市
    社会保障 -> 社会 保障
    的一部分 -> 的 一部分 -> 一 部分
    """
    with open("oov.txt", "r", encoding="utf-8") as f_in:
        all_tokens_and_counts = [line.strip().split("\t") for line in f_in]
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]  # keep tokens that occurred at least 3 times
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))
    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
            # f_out.write(token[0] + " " + token[1] + "\n")
        # for token in len3:
        #     idx = -1
        #     for part in len2:
        #         if part in token:
        #             idx = token.find(part)
        #             break
        #     if idx == -1:
        #         print("not found", token)
        #     elif idx == 0:
        #         f_out.write(token[0] + " " + token[1:] + "\n")
        #     else:
        #         f_out.write(token[:2] + " " + token[2] + "\n")
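# Sketch (not part of the original pipeline): one way to produce the two-piece merge groups
# described in the docstring above, along the lines of the commented-out len3 handling.
# `split_into_pair` and its `known_parts` argument are hypothetical names for illustration;
# `known_parts` would be something like the `len2` list built in convert_oov_to_merges().
def split_into_pair(token, known_parts):
    """E.g. 天津市 -> ("天津", "市") if "天津" is a known part, 承担 -> ("承", "担")."""
    if len(token) == 3:
        for part in known_parts:
            if token.startswith(part):
                return part, token[2:]
            if token.endswith(part):
                return token[0], token[1:]
    # Fallback: split off the first character, e.g. 承担 -> ("承", "担").
    return token[:1], token[1:]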
get_oov()
convert_oov_to_merges()