"""

"""

from vocab.gpt2 import tokenizer
# Alternative: load directly via Hugging Face transformers.
# from transformers import GPT2Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")        # hub checkpoint
# tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")   # local directory

# Inspect the raw BPE merge output for a string.
print(tokenizer.bpe('中国'))


print(tokenizer.encode("Hello world"))  # 默认 add_prefix_space=False
print(tokenizer.encode("Hello world", add_prefix_space=True))
print(tokenizer.encode(" Hello world"))
print(tokenizer.encode("Hello world", add_special_tokens=True))  # add_special_tokens 没用


print(tokenizer.encode(text='中国\n', add_special_tokens=False))
# print(tokenizer.encode(text='中国', add_special_tokens=False))
# print(tokenizer.tokenize('I love Salah and  salad'))
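
# GPT-2 uses byte-level BPE, so non-ASCII text is segmented over its UTF-8
# bytes and a two-character string can yield more than two ids. A small
# round-trip sketch (assuming the standard transformers decode() API):
ids = tokenizer.encode('中国', add_special_tokens=False)
print(len('中国'), len(ids))  # id count may exceed character count
print(tokenizer.decode(ids))  # decodes back to '中国'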