File size: 672 Bytes
f4973d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
import os

# Select the pure-Python protobuf backend. The variable is set before the
# protobuf-generated module below is imported so that it is already in the
# environment when that import runs.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# Alternative (disabled): parse the raw ModelProto from a serialized model
# file instead of going through SentencePieceProcessor.
# sp = sp_pb2_model.ModelProto()
# # sp.ParseFromString(open("zh_corpus.unigram.model", "rb").read())
# sp.ParseFromString(open("zh_corpus.bpe.user_defined_symbols.model", "rb").read())
import sentencepiece as spm

# Load the tokenizer model from a path relative to the working directory.
sp = spm.SentencePieceProcessor(model_file="Baichuan2-7B-Chat/tokenizer.model")

# Probe inputs: a multi-character Chinese word, its individual characters,
# a string mixing plain text with marker-like tokens, and two
# "<reserved_N>" strings.
SAMPLE_TEXTS = [
    "汉堡王",
    "汉",
    "堡",
    "sfds<|USER|>ss</s><Rhino>",
    "<reserved_87254>",
    "<reserved_928>",
]

for text in SAMPLE_TEXTS:
    # Encode each sample twice so the piece strings and their ids can be
    # compared on a single output line.
    result_str = sp.encode(text, out_type=str)
    result_int = sp.encode(text, out_type=int)
    print(result_str, result_int)