# Page header from the file viewer: author xu-song, commit 751936e ("update"), 555 bytes.
import json
# Ids of the tokens under investigation: the contiguous range 54611..54633.
error_tokens = list(range(54611, 54634))

# Tokenizer file whose ``model.vocab`` mapping (token -> id) is scanned.
TOKENIZER_PATH = "20B_tokenizer_chinese.v2.json"


def find_containing_tokens(vocab, token_ids):
    """Yield vocab entries that strictly contain one of the given tokens.

    Args:
        vocab: Mapping of token string to integer id.
        token_ids: Iterable of ids whose token strings should be searched for.

    Yields:
        ``(token_id, other)`` pairs, where ``other`` is a vocab token that
        contains the token for ``token_id`` as a substring and is not equal
        to it. Pairs follow the order of ``token_ids``, then vocab order.

    Raises:
        KeyError: If an id in ``token_ids`` is not present in ``vocab``.
    """
    # Invert the mapping once; if two tokens share an id, the later one wins.
    id2vocab = {idx: token for token, idx in vocab.items()}
    for token_id in token_ids:
        token = id2vocab[token_id]
        for other in vocab:
            if token in other and token != other:
                yield token_id, other


def main():
    """Load the tokenizer file and print ``catch`` once per match found."""
    # Context manager so the file handle is closed even if parsing fails.
    with open(TOKENIZER_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
    vocab = data["model"]["vocab"]
    for _ in find_containing_tokens(vocab, error_tokens):
        print("catch")


if __name__ == "__main__":
    main()