tokenizer-arena / vocab /gpt2 /test_hf_gpt2.py
eson's picture
update
751936e
raw
history blame
No virus
695 Bytes
"""
"""
from vocab.gpt2 import tokenizer
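# Disabled alternative: build the tokenizer directly with transformers
# instead of importing it from vocab.gpt2.
# from transformers import GPT2Tokenizer
# # tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")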
# Manual smoke check: print what the imported tokenizer returns for a few
# sample inputs so the results can be compared by eye.

# Result of tokenizer.bpe() on a Chinese string.
bpe_result = tokenizer.bpe('中国')
print(bpe_result)

# One entry per encode() call, in execution order:
# (positional arguments, keyword arguments).
encode_calls = [
    # Original author's note: add_prefix_space defaults to False.
    (("Hello world",), {}),
    (("Hello world",), {"add_prefix_space": True}),
    # Same text, but with the leading space written into the string itself.
    ((" Hello world",), {}),
    # Original author's note: add_special_tokens has no effect here.
    (("Hello world",), {"add_special_tokens": True}),
    # Text passed by keyword, with a trailing newline.
    ((), {"text": '中国\n', "add_special_tokens": False}),
]
for positional, keywords in encode_calls:
    print(tokenizer.encode(*positional, **keywords))
#
# print(tokenizer.encode(text='中国', add_special_tokens=False))
#
# print(tokenizer.tokenize('I love Salah and salad'))