# File size: 672 Bytes
# f4973d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

import os
# The environment variable is assigned before the sentencepiece protobuf
# module is imported below; keep this ordering.
# NOTE(review): presumably this selects protobuf's pure-Python backend and is
# read when protobuf is first imported — confirm against the protobuf docs.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# sp = sp_pb2_model.ModelProto()
# # sp.ParseFromString(open("zh_corpus.unigram.model", "rb").read())
# sp.ParseFromString(open("zh_corpus.bpe.user_defined_symbols.model", "rb").read())

import sentencepiece as spm
# Load the tokenizer model from its path relative to the working directory.
sp = spm.SentencePieceProcessor(model_file="Baichuan2-7B-Chat/tokenizer.model")

# Probe inputs: a Chinese word and two of its characters, a mixed string
# containing special-token-like markers, and two reserved-token names.
probe_texts = [
    "汉堡王",
    "汉",
    "堡",
    "sfds<|USER|>ss</s><Rhino>",
    "<reserved_87254>",
    "<reserved_928>",
]

# For each probe, print its encoding as string pieces followed by integer ids.
for probe in probe_texts:
    pieces = sp.encode(probe, out_type=str)
    ids = sp.encode(probe, out_type=int)
    print(pieces, ids)