"""
Implements the GPT-4 Tokenizer as a light wrapper around the RegexTokenizer.
Note that this is a pretrained tokenizer. By default and inside init(), it
loads the pretrained tokenizer from the `cl100k_base` tokenizer of tiktoken.
"""
import tiktoken
from .regex import RegexTokenizer

def bpe(mergeable_ranks, token, max_rank):
    # helper function used in recover_merges() to reconstruct the merge forest
    parts = [bytes([b]) for b in token]
    while True:
        # find the lowest-rank mergeable pair of adjacent parts
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        # stop when nothing merges, or the best merge is at/above max_rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts
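
# A minimal sketch of how bpe() behaves (a hedged example, not part of the
# original module; assumes tiktoken's cl100k_base is available):
#
#   enc = tiktoken.get_encoding("cl100k_base")
#   ranks = enc._mergeable_ranks
#   # with max_rank=None, the bytes merge all the way back into the token:
#   bpe(ranks, b"hello", max_rank=None)             # expect [b'hello']
#   # capping max_rank at the token's own rank stops one step early and
#   # exposes the final pair that was merged to produce it:
#   bpe(ranks, b"hello", max_rank=ranks[b"hello"])  # expect a 2-element list
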
def recover_merges(mergeable_ranks):
    # the `merges` are already the byte sequences in their merged state.
    # so we have to recover the original pairings. We can do this by doing
    # a small BPE training run on all the tokens, in their order.
    # also see https://github.com/openai/tiktoken/issues/60
    # also see https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306
    merges = {}
    for token, rank in mergeable_ranks.items():
        if len(token) == 1:
            continue # skip raw bytes
        pair = tuple(bpe(mergeable_ranks, token, max_rank=rank))
        assert len(pair) == 2
        # recover the integer ranks of the pair
        ix0 = mergeable_ranks[pair[0]]
        ix1 = mergeable_ranks[pair[1]]
        merges[(ix0, ix1)] = rank
    return merges
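
# Sketch of the recovered structure: an entry merges[(ix0, ix1)] == rank means
# the token with rank `rank` was formed by merging the tokens whose ranks are
# ix0 and ix1, e.g.
#
#   merges = recover_merges(enc._mergeable_ranks)
#   # {(ix0, ix1): rank, ...} where ix0, ix1 < rank, since a token can
#   # only be built out of tokens that were merged earlier.
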
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
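
# Rough sketch of how the split pattern chunks text before BPE is applied
# (uses the `regex` package that RegexTokenizer already relies on; the chunks
# shown are the expected behavior, not verified output):
#
#   import regex as re
#   re.findall(GPT4_SPLIT_PATTERN, "Hello've world123!!")
#   # expect ["Hello", "'ve", " world", "123", "!!"]
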
GPT4_SPECIAL_TOKENS = {
    '<|endoftext|>': 100257,
    '<|fim_prefix|>': 100258,
    '<|fim_middle|>': 100259,
    '<|fim_suffix|>': 100260,
    '<|endofprompt|>': 100276
}

class GPT4Tokenizer(RegexTokenizer):
    """Lightweight wrapper on RegexTokenizer that matches GPT-4's tokenizer."""

    def __init__(self):
        super().__init__(pattern=GPT4_SPLIT_PATTERN)
        # get the official tokenizer and its merges
        enc = tiktoken.get_encoding("cl100k_base")
        mergeable_ranks = enc._mergeable_ranks
        # the merges are those of gpt4, but we have to recover them
        self.merges = recover_merges(mergeable_ranks)
        # reconstruct the vocab from the merges
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        self.vocab = vocab
        # now here is another tricky part.
        # for some reason, the tokens corresponding to individual bytes
        # are permuted in a different order. This is completely non-sensical
        # and probably historical, but therefore we have to deal with it here.
        self.byte_shuffle = {i: mergeable_ranks[bytes([i])] for i in range(256)}
        self.inverse_byte_shuffle = {v: k for k, v in self.byte_shuffle.items()}
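        # sanity-check sketch (shown as a comment to keep __init__ unchanged):
        # the single-byte tokens occupy ranks 0..255, so byte_shuffle is a
        # permutation of 0..255 and the two tables must invert each other:
        #   assert all(self.inverse_byte_shuffle[self.byte_shuffle[b]] == b
        #              for b in range(256))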
        # finally register the special tokens
        self.register_special_tokens(GPT4_SPECIAL_TOKENS)
    def _encode_chunk(self, text_bytes):
        # before we start processing bytes, we have to permute them
        text_bytes = bytes(self.byte_shuffle[b] for b in text_bytes)
        ids = super()._encode_chunk(text_bytes)
        return ids
    def decode(self, ids):
        # we have to un-permute the bytes before we decode
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text_bytes = bytes(self.inverse_byte_shuffle[b] for b in text_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text
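
    # usage sketch (assumes RegexTokenizer.encode() from the base class, with
    # its default handling of special tokens):
    #   gpt4 = GPT4Tokenizer()
    #   assert gpt4.decode(gpt4.encode("hello world")) == "hello world"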

    # this is a pretrained tokenizer, it is not intended to be trained
    def train(self, text, vocab_size, verbose=False):
        raise NotImplementedError

    # save/load would require some thought.
    # we'd have to change save/load of base to add support for byte_shuffle...
    # alternatively, we could move byte_shuffle to base class, but that would
    # mean that we're making ugly our beautiful Tokenizer just to support
    # the GPT-4 tokenizer and its weird historical quirks around byte_shuffle.
    def save(self, file_prefix):
        raise NotImplementedError("GPT4Tokenizer cannot be saved.")

    def load(self, model_file):
        raise NotImplementedError("GPT4Tokenizer cannot be loaded.")
    def save_vocab(self, vocab_file):
        # just for visualization purposes let's output the GPT-4 tokens
        # in the exact same format as the base class would.
        # simply run as:
        # python -c "from minbpe import GPT4Tokenizer; GPT4Tokenizer().save_vocab('gpt4.vocab')"
        from .base import render_token
        # build vocab being mindful of the byte shuffle
        vocab = {idx: bytes([self.inverse_byte_shuffle[idx]]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        # now merge the shuffled bytes and write to file
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in vocab.items():
                s = render_token(token)
                if idx in inverted_merges:
                    # this token has children: render it nicely as a merge
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(vocab[idx0])
                    s1 = render_token(vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    # otherwise it's a leaf token (one of the 256 raw bytes)
                    f.write(f"[{s}] {idx}\n")