"""Small demo of icetk: tokenize text, encode it to ids, and show each id's piece."""
from icetk import icetk

# Offset between the ids returned by icetk.encode() and indices into the text
# tokenizer's piece table.
# NOTE(review): presumably the first 20000 ids are reserved for image tokens --
# confirm against the icetk sources.
TEXT_ID_OFFSET = 20000

# Per the icetk README, tokenize() returns string pieces while encode()
# returns integer ids.
tokens = icetk.tokenize('Hello World! I am icetk.')
ids = icetk.encode('你好世界!这里是 icetk。')
print(ids)
# ids == [20005, 94874, 84097, 20035, 94947, 22881, 35955, 83823]

# Print every id next to the piece it maps to. The loop must walk `ids`
# (integers), not `tokens` (strings): subtracting the offset from a string
# raises TypeError.
pieces = icetk.text_tokenizer.proto.pieces
for token_id in ids:
    print(token_id, pieces[token_id - TEXT_ID_OFFSET].piece)