File size: 476 Bytes
751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
from icetk import icetk
# Demo: encode a sentence with icetk and print the vocabulary piece behind
# each resulting id.

# Per the icetk README, tokenize() returns string pieces rather than integer
# ids, so `tokens` cannot feed the arithmetic lookup below. The assignment is
# kept so the module-level name `tokens` still exists.
tokens = icetk.tokenize('Hello World! I am icetk.')

ids = icetk.encode('你好世界!这里是 icetk。')
print(ids)
# ids == [20005, 94874, 84097, 20035, 94947, 22881, 35955, 83823]

# Iterate over the integer ids: looping over the string pieces in `tokens`
# would raise TypeError on `token - 20000`.
# NOTE(review): the 20000 offset presumably skips an id range reserved ahead
# of the text vocabulary (image tokens) -- confirm against icetk's layout.
for token in ids:
    print(token, icetk.text_tokenizer.proto.pieces[token - 20000].piece)