File size: 487 Bytes
751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
"""
Quick check of the GPT-NeoX-20B tokenizer on a mixed English/Chinese string.

See also: https://github.com/EleutherAI/gpt-neox/blob/main/tools/corpora.py
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
# Load the tokenizer published under the "EleutherAI/gpt-neox-20b" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

# Sample text mixing English words, a newline, and Chinese characters.
tokens = tokenizer.encode("good night\n中国 ss一个人去哪里")
print(tokens)

# Decode the full id sequence back to text to compare against the input.
print(tokenizer.decode(tokens))

# Print each id next to the text it decodes to on its own, to show how the
# sample was split. NOTE(review): a single id may cover only part of a
# multi-byte character, so individual pieces may not be readable — confirm.
for token in tokens:
    print(token, tokenizer.decode([token]))