File size: 487 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
"""

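Tokenize a mixed English/Chinese sample string with the
"EleutherAI/gpt-neox-20b" tokenizer and print the token ids, the decoded
text, and every token id decoded on its own.

Reference: https://github.com/EleutherAI/gpt-neox/blob/main/tools/corpora.py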

"""


from transformers import AutoTokenizer, AutoModelForCausalLM

# Alternative tokenizer checkpoint, currently disabled:
# tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
model_name = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sample mixing English words, a newline, CJK characters and a run of spaces.
sample_text = "good night\n中国      ss一个人去哪里"
tokens = tokenizer.encode(sample_text)

# First the full id list, then the text obtained by decoding all ids at once.
decoded_text = tokenizer.decode(tokens)
print(tokens)
print(decoded_text)

# Finally, one line per id: the id followed by that single id decoded alone.
for token in tokens:
    piece = tokenizer.decode([token])
    print(token, piece)