"""Check whether selected tokenizer tokens occur inside other vocab entries.

Loads the ``model.vocab`` mapping (token string -> id) from a tokenizer JSON
file and, for every id in ``error_tokens``, prints ``catch`` once for each
*other* vocabulary entry whose text contains that token as a substring.
"""
import json
from typing import Dict, Iterable, Iterator, Tuple

# Ids of the tokens to inspect: the 23 consecutive ids 54611..54633.
error_tokens = list(range(54611, 54634))

# Tokenizer file, resolved relative to the current working directory.
VOCAB_PATH = "20B_tokenizer_chinese.v2.json"


def load_vocab(path: str) -> Dict[str, int]:
    """Return the ``model.vocab`` mapping stored in the tokenizer JSON file.

    Args:
        path: Path of the tokenizer JSON file.

    Returns:
        The object stored under ``data["model"]["vocab"]``; the rest of this
        script uses it as a token-string -> token-id mapping.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the ``model`` or ``vocab`` key is missing.
    """
    # A context manager closes the handle even when parsing fails.
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
    return data["model"]["vocab"]


def find_containing_tokens(
    vocab: Dict[str, int], token_ids: Iterable[int]
) -> Iterator[Tuple[str, str]]:
    """Yield ``(token, other)`` for each vocab entry that contains ``token``.

    Args:
        vocab: Mapping of token string -> token id.
        token_ids: Ids of the tokens to look for inside other entries.

    Yields:
        One pair per hit, ordered by ``token_ids`` first and by the iteration
        order of ``vocab`` second. An entry never matches itself.

    Raises:
        KeyError: When iteration reaches an id that is not a value of
            ``vocab``.
    """
    # Invert the mapping once; if two tokens share an id, the later one wins.
    id2vocab = {idx: token for token, idx in vocab.items()}
    for token_id in token_ids:
        token = id2vocab[token_id]
        for candidate in vocab:
            # Proper containment only: skip the token's own entry.
            if token in candidate and token != candidate:
                yield token, candidate


def main() -> None:
    """Scan the vocabulary and print ``catch`` once per containing entry."""
    vocab = load_vocab(VOCAB_PATH)
    for _token, _container in find_containing_tokens(vocab, error_tokens):
        print("catch")


if __name__ == "__main__":
    main()