import os

# Force the pure-Python protobuf backend so the generated
# sentencepiece_model_pb2 module loads regardless of which C++ protobuf
# runtime is installed. Must be set before protobuf is imported.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# Earlier experiments: parsing other models through the protobuf view.
# sp = sp_pb2_model.ModelProto()
# sp.ParseFromString(open("zh_corpus.unigram.model", "rb").read())
# sp.ParseFromString(open("zh_corpus.bpe.user_defined_symbols.model", "rb").read())
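# A minimal sketch (not part of the original script): the protobuf view can
# also inspect the raw vocabulary of the model loaded below, including each
# piece's type (NORMAL=1, UNKNOWN=2, CONTROL=3, USER_DEFINED=4, BYTE=6).
proto = sp_pb2_model.ModelProto()
with open("Baichuan2-7B-Chat/tokenizer.model", "rb") as f:
    proto.ParseFromString(f.read())
for p in proto.pieces[:5]:
    print(p.piece, p.type)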
import sentencepiece as spm

# Load the Baichuan2 tokenizer and segment a few probe strings, printing
# both the surface pieces and the corresponding token ids. The samples
# cover Chinese text, control/user-defined symbols, and reserved
# placeholder tokens.
sp = spm.SentencePieceProcessor(model_file="Baichuan2-7B-Chat/tokenizer.model")

for text in ["汉堡王", "汉", "堡", "sfds<|USER|>ss</s><Rhino>", "<reserved_87254>", "<reserved_928>"]:
    result_str = sp.encode(text, out_type=str)
    result_int = sp.encode(text, out_type=int)
    print(result_str, result_int)
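
# A minimal round-trip sketch (an addition, not in the original script):
# decode() should recover ordinary text from its ids; control symbols such
# as </s> are dropped on decode, so only plain strings are checked here.
for text in ["汉堡王", "汉", "堡"]:
    ids = sp.encode(text, out_type=int)
    print(text, "->", sp.decode(ids))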