import regex as re

from transformers import GPT2Tokenizer


class FLMTokenizer(GPT2Tokenizer):
    """GPT-2 style BPE tokenizer for FLM models, differing from the parent
    class only in the pre-tokenization split pattern set in __init__."""

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            **kwargs,
        )

        # Replace the parent's pre-tokenization pattern with a finer-grained
        # split: case-insensitive contractions, letter runs with an optional
        # leading non-letter, digit runs of at most three digits, punctuation
        # runs with trailing newlines, newline runs, and remaining whitespace.
        # Compiling keeps self.pat the same type the parent class stores, so
        # the inherited re.findall(self.pat, text) call works unchanged.
        self.pat = re.compile(
            r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
        )
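

if __name__ == "__main__":
    # Minimal usage sketch, assuming a vocab.json / merges.txt pair from an
    # FLM checkpoint sits in the working directory (the paths below are
    # placeholders, not files shipped with this module).
    tokenizer = FLMTokenizer("vocab.json", "merges.txt")

    text = "Hello world, it's 2024!"
    # Raw pre-tokenization chunks produced by the custom pattern, before BPE.
    print(re.findall(tokenizer.pat, text))
    # Full encode/decode round trip through the inherited GPT-2 BPE machinery.
    ids = tokenizer(text)["input_ids"]
    print(ids)
    print(tokenizer.decode(ids))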