File size: 486 Bytes
f4973d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

# Deserialize the SentencePiece model file into a ModelProto message so the
# individual vocabulary pieces can be inspected below.
baichuan_spm = sp_pb2_model.ModelProto()
# Read through a context manager so the file handle is closed as soon as the
# bytes are parsed, instead of being left for the garbage collector.
with open("Baichuan2-7B-Chat/tokenizer.model", "rb") as model_file:
    baichuan_spm.ParseFromString(model_file.read())


# Print the index and fields of every vocabulary piece whose text contains
# the substring "reser" (presumably the reserved placeholder tokens — confirm
# against the actual model vocabulary).
vocab_size = len(baichuan_spm.pieces)
for i, piece in enumerate(baichuan_spm.pieces):
    if "reser" in piece.piece:
        # str(piece) is the message's multi-line text dump; collapse it to a
        # single comma-separated line so each match prints on one row.
        print(i, str(piece).strip().replace("\n", ", "))