"""Encoding constructors for Hebrew BPE tokenizers published by Norod78 on the
Hugging Face Hub, exposed to tiktoken through the ENCODING_CONSTRUCTORS registry."""

from tiktoken.load import data_gym_to_mergeable_bpe_ranks


def gpt_j_hebrew_tokenizer():
    # Build the BPE merge ranks from the GPT-2-style merges/vocab files in the
    # Norod78/gpt-j-hebrew-tokenizer repository.
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/merges.txt",
        encoder_json_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/vocab.json",
    )
    return {
        "name": "gpt-j-hebrew-tokenizer",
        "explicit_n_vocab": 50257,
        # GPT-2 pre-tokenization pattern: English contractions, runs of letters,
        # digits, other characters, and whitespace.
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": 50256},
    }
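# Usage sketch (illustrative only, not executed on import): the dict returned
# above matches the keyword arguments of tiktoken.Encoding, so an encoding can
# be constructed directly from it, without going through the registry below.
#
#     import tiktoken
#     enc = tiktoken.Encoding(**gpt_j_hebrew_tokenizer())
#     tokens = enc.encode("שלום עולם")
#     assert enc.decode(tokens) == "שלום עולם"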


def gpt_hebrew_tokenizer():
    # Build the BPE merge ranks from the GPT-2-style merges/vocab files in the
    # Norod78/TinyStories-3M-val-Hebrew repository.
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/merges.txt",
        encoder_json_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/vocab.json",
    )
    return {
        "name": "gpt-hebrew-tokenizer",
        # Merge ranks plus the three special tokens below.
        "explicit_n_vocab": 50259,
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": 50256, "<|startoftext|>": 50257, "<|pad|>": 50258},
    }


# Plugin registry: if this module is installed under the tiktoken_ext namespace
# package, tiktoken.get_encoding() can discover these constructors by name.
ENCODING_CONSTRUCTORS = {
    "gpt-j-hebrew-tokenizer": gpt_j_hebrew_tokenizer,
    "gpt-hebrew-tokenizer": gpt_hebrew_tokenizer,
}
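

if __name__ == "__main__":
    # Minimal smoke test (a sketch; assumes network access to fetch the vocab
    # and merges files). The encodings are built directly from the constructors,
    # so this works even when the module is not installed as a tiktoken_ext plugin.
    import tiktoken

    for constructor in ENCODING_CONSTRUCTORS.values():
        enc = tiktoken.Encoding(**constructor())
        text = "שלום עולם"
        tokens = enc.encode(text)
        assert enc.decode(tokens) == text
        print(enc.name, len(tokens), tokens)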