from transformers import AutoTokenizer, GPTNeoXJapaneseTokenizer
# Load the tokenizer from the local "tokenizer" directory; the commented-out line
# below loads the equivalent tokenizer from the Hugging Face Hub instead.
tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("tokenizer")
# tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
# Encode the mixed Japanese / URL / emoji text into token ids, then print each id
# alongside its decoded piece.
tokens = tokenizer.encode("人とAIが協調するためには http://baidu.com 🤣")
for token in tokens:
    print(token, tokenizer.decode([token]))
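
# Round-trip sketch: decoding the whole id sequence at once should roughly
# reconstruct the input text (standard PreTrainedTokenizer.decode usage).
print(tokenizer.decode(tokens))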
# Tokenize into subword strings; clean=True is intended to apply the tokenizer's
# text-cleaning step (e.g. normalizing URLs) before splitting.
tokens = tokenizer.tokenize("人とAIが協調するためには http://baidu.com 🤣", clean=True)
print(tokens)
# for token in tokens:
# print(token, tokenizer.decode([token]))
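
# Sketch: map the subword strings back to ids with the generic
# convert_tokens_to_ids API (assumed to apply unchanged to this tokenizer).
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)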