# (Removed non-Python extraction residue — file-size line, commit hash, and
# stray viewer line numbers — that preceded the script and broke parsing.)

"""Check whether known-bad tokenizer tokens appear inside other vocab entries.

Loads the tokenizer vocabulary from ``20B_tokenizer_chinese.v2.json`` and, for
each token id in ``error_tokens``, scans the vocabulary for any *other* token
whose string strictly contains the error token's string, printing "catch" once
per hit (same observable output as the original script).
"""
import json

# Token ids previously identified as problematic — a contiguous range,
# 54611 through 54633 inclusive (23 ids), same values as the original list.
error_tokens = list(range(54611, 54634))


def find_superstring_tokens(vocab, error_token_ids):
    """Map each error token id to the vocab tokens that strictly contain it.

    Args:
        vocab: mapping of token string -> token id.
        error_token_ids: iterable of token ids to look up in ``vocab``.

    Returns:
        dict mapping token id -> list of vocab tokens (in vocab insertion
        order) that contain the error token's string as a proper substring.
        Ids with no containing token are omitted from the result.

    Raises:
        KeyError: if an id in ``error_token_ids`` does not occur as a value
            in ``vocab``.
    """
    id2vocab = {idx: token for token, idx in vocab.items()}
    hits = {}
    for token_id in error_token_ids:
        token = id2vocab[token_id]
        # Strict containment: the token itself is excluded, matching the
        # original `token in tmp and token != tmp` condition.
        matches = [other for other in vocab if token in other and other != token]
        if matches:
            hits[token_id] = matches
    return hits


def main():
    # Context manager closes the file deterministically (the original
    # `json.load(open(...))` left the handle to the garbage collector).
    with open("20B_tokenizer_chinese.v2.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    vocab = data["model"]["vocab"]
    for matches in find_superstring_tokens(vocab, error_tokens).values():
        for _ in matches:
            # Preserve the original script's output: one line per hit.
            print("catch")


if __name__ == "__main__":
    main()