# TinyStories-3M-val-Hebrew / tiktoken / tiktoken_ext / tiktoken_ext_norod78_hf.py
# Author: Norod78 — "Add TikToken extension support for the Hebrew Tokenizer"
# (Hugging Face commit 830833d)
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
def gpt_j_hebrew_tokenizer():
    """Build the tiktoken encoding spec for Norod78's GPT-J Hebrew tokenizer.

    Fetches the GPT-2-style ``merges.txt`` / ``vocab.json`` pair from the
    Hugging Face hub and converts them into tiktoken's mergeable-rank format.

    Returns:
        dict: Keyword arguments for ``tiktoken.Encoding`` — name, explicit
        vocab size, split pattern, mergeable ranks, and special tokens.
    """
    # GPT-2/GPT-J pre-tokenization split pattern (contractions, letter runs,
    # digit runs, punctuation runs, then whitespace).
    split_pattern = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    # NOTE: this downloads the vocab files over the network on first use.
    ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/merges.txt",
        encoder_json_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/vocab.json",
    )

    spec = {
        "name": "gpt-j-hebrew-tokenizer",
        "explicit_n_vocab": 50257,
        "pat_str": split_pattern,
        "mergeable_ranks": ranks,
        # Single GPT-2-style end-of-text marker at the last vocab slot.
        "special_tokens": {"<|endoftext|>": 50256},
    }
    return spec
def gpt_hebrew_tokenizer():
    """Build the tiktoken encoding spec for the TinyStories-3M-val-Hebrew tokenizer.

    Fetches the GPT-2-style ``merges.txt`` / ``vocab.json`` pair from the
    Hugging Face hub and converts them into tiktoken's mergeable-rank format.

    Returns:
        dict: Keyword arguments for ``tiktoken.Encoding`` — name, explicit
        vocab size, split pattern, mergeable ranks, and special tokens.
    """
    # GPT-2-style pre-tokenization split pattern, identical to the GPT-J
    # Hebrew tokenizer above.
    split_pattern = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    # NOTE: this downloads the vocab files over the network on first use.
    ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/merges.txt",
        encoder_json_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/vocab.json",
    )

    spec = {
        "name": "gpt-hebrew-tokenizer",
        # 50256 base ids + 3 special tokens below = 50259 total.
        "explicit_n_vocab": 50259,
        "pat_str": split_pattern,
        "mergeable_ranks": ranks,
        "special_tokens": {"<|endoftext|>": 50256, "<|startoftext|>": 50257, "<|pad|>": 50258},
    }
    return spec
# Plugin registry read by tiktoken: modules placed under the ``tiktoken_ext``
# namespace package must expose ENCODING_CONSTRUCTORS mapping each encoding
# name to a zero-argument constructor returning the Encoding kwargs dict.
ENCODING_CONSTRUCTORS = {
"gpt-j-hebrew-tokenizer": gpt_j_hebrew_tokenizer,
"gpt-hebrew-tokenizer": gpt_hebrew_tokenizer,
}