# Quick tokenizer probe: load the Baichuan2-7B-Chat SentencePiece model and
# print, for a handful of sample strings, the pieces and the ids it produces.
import os

# Must be assigned before the sentencepiece protobuf module is imported below.
# NOTE(review): this presumably selects the pure-Python protobuf backend;
# confirm it is still required for the installed protobuf version.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# Disabled alternative: parse a raw model file into a ModelProto instead of
# using the processor.
# sp = sp_pb2_model.ModelProto()
# # sp.ParseFromString(open("zh_corpus.unigram.model", "rb").read())
# sp.ParseFromString(open("zh_corpus.bpe.user_defined_symbols.model", "rb").read())

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="Baichuan2-7B-Chat/tokenizer.model")

# Sample inputs: a multi-character word, its individual characters, a string
# containing the "<|USER|>" marker, and two empty strings.
samples = ("汉堡王", "汉", "堡", "sfds<|USER|>ss", "", "")

for sample in samples:
    # Encode the same text twice: once as string pieces, once as integer ids.
    pieces = sp.encode(sample, out_type=str)
    ids = sp.encode(sample, out_type=int)
    print(pieces, ids)