"""Scratch script that prints what the ``vocab.gpt2`` tokenizer produces.

It prints the result of ``tokenizer.bpe`` for a Chinese string, followed by
the results of several ``tokenizer.encode`` calls that vary the leading
space, ``add_prefix_space`` and ``add_special_tokens``.
"""

from vocab.gpt2 import tokenizer

# Alternative ways of obtaining the tokenizer, kept for reference:
# from transformers import GPT2Tokenizer
#
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

# One (positional args, keyword args) pair per encode() call, in print order.
_ENCODE_CALLS = (
    # Disabled baseline call; original note: the default is
    # add_prefix_space=False.
    # (("Hello world",), {}),
    (("Hello world",), {"add_prefix_space": True}),
    ((" Hello world",), {}),
    # Original author's note: add_special_tokens has no effect here.
    (("Hello world",), {"add_special_tokens": True}),
    ((), {"text": '中国\n', "add_special_tokens": False}),
    # Disabled variant without the trailing newline:
    # ((), {"text": '中国', "add_special_tokens": False}),
)

print(tokenizer.bpe('中国'))

for _positional, _keywords in _ENCODE_CALLS:
    print(tokenizer.encode(*_positional, **_keywords))

# Disabled:
# print(tokenizer.tokenize('I love Salah and salad'))