tokenizer-arena / vocab /icetk /test_icetk.py
eson's picture
update
751936e
raw history blame
No virus
476 Bytes
"""Demo of the icetk tokenizer: tokenize/encode text and inspect pieces.

Bug fix vs. the original: the inspection loop ran over `tokens`, which
`icetk.tokenize` returns as token *strings*, so `token - 20000` raised a
TypeError. The loop now runs over the integer `ids` from `icetk.encode`,
which is what the piece-table lookup expects.
"""
from icetk import icetk

# Token strings for an English sentence (kept for reference/printing).
tokens = icetk.tokenize('Hello World! I am icetk.')
print(tokens)

# Integer token ids for a Chinese sentence.
ids = icetk.encode('你好世界!这里是 icetk。')
print(ids)
# ids == [20005, 94874, 84097, 20035, 94947, 22881, 35955, 83823]

# Look up the underlying sentencepiece piece for each id.
# NOTE(review): text-token ids appear to start at 20000 (lower ids are
# image tokens), hence the `- 20000` offset into the piece table —
# confirm against the icetk vocabulary layout.
for token_id in ids:
    print(token_id, icetk.text_tokenizer.proto.pieces[token_id - 20000].piece)