Spaces:

xu-song
/

tokenizer-arena

update

751936e about 1 year ago

555 Bytes


	import json
	# Token ids to inspect. The name suggests these are known-problematic
	# vocabulary entries; the reason is not recorded in this script.
	error_tokens = [
		54611, 54612, 54613, 54614, 54615, 54616, 54617, 54618, 54619, 54620,
		54621, 54622, 54623, 54624, 54625, 54626, 54627, 54628, 54629, 54630,
		54631, 54632, 54633,
	]

	# Load the tokenizer definition. The context manager closes the file handle
	# as soon as parsing finishes instead of leaving it open until garbage
	# collection.
	with open("20B_tokenizer_chinese.v2.json", "r", encoding="utf-8") as vocab_file:
		data = json.load(vocab_file)
	vocab = data["model"]["vocab"]  # maps token string -> token id
	# Reverse mapping: token id -> token string.
	id2vocab = {idx: token for token, idx in vocab.items()}


	# For each inspected id, print "catch" once for every other vocabulary entry
	# that contains the token's text as a proper substring.
	for token_id in error_tokens:
		# A KeyError here means the id is not present in the loaded vocabulary.
		token = id2vocab[token_id]
		for tmp in vocab:
			if token in tmp and token != tmp:
				print("catch")

	# print("a")
	# json.la