# xu-song's picture
# add more tokenizers
# f4973d4
# raw
# history blame
# 292 Bytes
# Module-level tokenizer used by test(). It is initialised to None here, so it
# must be rebound to a real tokenizer object before test() is called.
# NOTE(review): no assignment of a concrete tokenizer is visible in this file.
tokenizer = None
def test():
    """Encode a fixed sample string and print every token id with its token.

    Uses the module-level ``tokenizer``, which must have been set to an
    object providing ``encode`` and ``convert_ids_to_tokens`` before this
    function is called. One line is printed per token id, in the form
    ``<token_id> : <token>``.
    """
    # NOTE(review): the sample text ends with "8个空格" ("8 spaces") but holds
    # only a single space -- presumably whitespace was collapsed; confirm.
    encoding = tokenizer.encode("测试华为手机10086 8个空格")
    for token_id in encoding:
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        if isinstance(token, (bytes, bytearray)):
            # A bytes token may contain only part of a multi-byte UTF-8
            # character; a strict decode would raise UnicodeDecodeError, so
            # undecodable bytes are shown as U+FFFD instead.
            token = token.decode("utf-8", errors="replace")
        # Tokens that are already str are printed unchanged.
        print(token_id, ":", token)
# Run the tokenizer demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test()