"""Demo: encode text with icetk and print the vocabulary piece behind each id."""
from icetk import icetk

# Offset subtracted from an id before indexing the text tokenizer's piece table.
# NOTE(review): presumably ids below this value are reserved for non-text
# tokens, so such ids would index the wrong piece — confirm against icetk.
TEXT_ID_OFFSET = 20000

# Tokenize an English sample. Per the icetk README this yields string pieces
# (not integer ids), so it must not be fed to the id arithmetic below.
tokens = icetk.tokenize('Hello World! I am icetk.')

# Encode a Chinese sample into integer ids.
ids = icetk.encode('你好世界!这里是 icetk。')
print(ids)
# ids == [20005, 94874, 84097, 20035, 94947, 22881, 35955, 83823]

# Iterate over the integer ids (the original looped over `tokens`, whose
# string elements cannot take part in `token - 20000`).
for token_id in ids:
    piece = icetk.text_tokenizer.proto.pieces[token_id - TEXT_ID_OFFSET].piece
    print(token_id, piece)