tokenizer-arena / vocab /baichuan2 /special_token.py
eson's picture
add more tokenizers
f4973d4
raw history blame
No virus
486 Bytes
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm
# Parse the serialized SentencePiece model into a ModelProto message so the
# vocabulary pieces can be inspected one by one.
baichuan_spm = sp_pb2_model.ModelProto()

# The path is relative to the current working directory. A context manager is
# used so the file handle is closed as soon as the bytes have been read.
with open("Baichuan2-7B-Chat/tokenizer.model", "rb") as model_file:
    baichuan_spm.ParseFromString(model_file.read())

vocab_size = len(baichuan_spm.pieces)

# Print the index and a single-line rendering of every piece whose text
# contains the substring "reser" (presumably the reserved/placeholder
# tokens -- confirm against the model's vocabulary).
for i, piece in enumerate(baichuan_spm.pieces):
    if "reser" in piece.piece:
        # str(piece) is the multi-line text form of the message; newlines are
        # replaced with ", " so each match occupies exactly one output line.
        print(i, str(piece).strip().replace("\n", ", "))