from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
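# Pipeline: get_oov() scans a Chinese word-frequency list and records every word that the
# tokenizer splits into more than one token; convert_oov_to_merges() then turns the
# collected words into vocabulary additions written to oov.add.txt.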
def get_oov():
    """Collect words from the frequency list that the tokenizer splits into more than one token."""
    with open("oov.txt", "w", encoding="utf-8") as f_out, \
            open("../../vocab.freq.zh.txt", "r", encoding="utf-8") as all_words:
        for line in all_words:
            word, count = line.strip().split("\t")
            # Skip words containing the replacement character and a few noisy entries.
            if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
                continue
            encoding = tokenizer.encode(word)
            if len(encoding.ids) > 1:  # more than one token id => OOV for this vocab
                f_out.write(line)
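# A minimal helper capturing the same OOV criterion (hypothetical, not used by the script):
# a word is OOV for this vocab exactly when encoding it needs more than one token id.
def is_oov(word):
    return len(tokenizer.encode(word).ids) > 1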
def build_vocab():
    pass
def convert_oov_to_merges():
    """Split each word into merge groups; every group must be a pair, e.g.
    承担 -> 承 担
    天津市 -> 天津 市
    社会保障 -> 社会 保障
    的一部分 -> 的 一部分 -> 一 部分
    """
    with open("oov.txt", "r", encoding="utf-8") as f_in:
        all_tokens_and_counts = [line.strip().split("\t") for line in f_in]
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]  # keep tokens that occurred at least 3 times
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))
    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
            # f_out.write(token[0] + " " + token[1] + "\n")
        # for token in len3:
        #     idx = -1
        #     for part in len2:
        #         if part in token:
        #             idx = token.find(part)
        #             break
        #     if idx == -1:
        #         print("not found", token)
        #     elif idx == 0:
        #         f_out.write(token[0] + " " + token[1:] + "\n")
        #     else:
        #         f_out.write(token[:2] + " " + token[2] + "\n")
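# Sketch (not part of the original pipeline): one way to produce the two-piece merge groups
# described in the docstring above, along the lines of the commented-out len3 handling.
# `split_into_pair` and its `known_parts` argument are hypothetical names for illustration;
# `known_parts` would be something like the `len2` list built in convert_oov_to_merges().
def split_into_pair(token, known_parts):
    """E.g. 天津市 -> ("天津", "市") if "天津" is a known part, 承担 -> ("承", "担")."""
    if len(token) == 3:
        for part in known_parts:
            if token.startswith(part):
                return part, token[2:]
            if token.endswith(part):
                return token[0], token[1:]
    # Fallback: split off the first character, e.g. 承担 -> ("承", "担").
    return token[:1], token[1:]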
get_oov()
convert_oov_to_merges()