# Print every piece of the Baichuan2-7B-Chat SentencePiece model whose text
# contains "reser" (presumably the reserved placeholder tokens -- confirm
# against the model's vocabulary), together with its index.
import os

# Select the pure-Python protobuf backend. This has to be set before the
# protobuf-generated module below is imported, so the assignment deliberately
# sits between the imports.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

# Parse the serialized tokenizer model into a ModelProto message. The file is
# opened in a `with` block so the handle is closed as soon as it has been read.
baichuan_spm = sp_pb2_model.ModelProto()
with open("Baichuan2-7B-Chat/tokenizer.model", "rb") as model_file:
    baichuan_spm.ParseFromString(model_file.read())

vocab_size = len(baichuan_spm.pieces)

# One output line per matching piece: its index, then the piece's string form
# with surrounding whitespace stripped and newlines replaced by ", ".
for i, piece in enumerate(baichuan_spm.pieces):
    if "reser" in piece.piece:
        print(i, str(piece).strip().replace("\n", ", "))