import os  # used only by the commented-out local-loading path below

from transformers import GPT2Tokenizer
from vocab import TokenizerType, TokenizerImpl  # repo-local module; referenced only in the commented line below

# Alternative: load the tokenizer files from a local directory next to this script.
# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
# tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_DIR)

# Load the GPT-2 BPE tokenizer from the Hugging Face Hub.
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
# tokenizer.type = TokenizerType.

# Source: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/tokenization_gpt2.py
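
# A minimal usage sketch (not part of the original file): encode and decode a
# sample string to confirm the tokenizer loaded correctly. The sample text and
# the printed checks are illustrative assumptions, not repo code.
if __name__ == "__main__":
    text = "Hello world"
    token_ids = tokenizer.encode(text)                    # GPT-2 BPE ids, e.g. [15496, 995]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)   # e.g. ['Hello', 'Ġworld'] ('Ġ' marks a leading space)
    print(token_ids, tokens)
    print(tokenizer.decode(token_ids))                    # round-trips back to the original text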