Update tiktoken_tokenizer.py
Browse files- tiktoken_tokenizer.py +10 -13
tiktoken_tokenizer.py
CHANGED
@@ -53,24 +53,21 @@ class BaseTokenizer(PreTrainedTokenizer):
|
|
53 |
return NotImplemented
|
54 |
|
55 |
class TikTokenizer(BaseTokenizer):
|
56 |
-
|
57 |
-
def from_pretrained(path, *inputs, **kwargs):
|
58 |
-
return TikTokenizer(vocab_file=os.path.join(path, "tokenizer.tiktoken"))
|
59 |
|
60 |
-
def __init__(self, vocab_file
|
61 |
pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
62 |
self.pat_str = re.compile(pat_str)
|
63 |
|
64 |
self.b64_vocab = {}
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
self.b64_vocab['%s' % token] = rank
|
74 |
|
75 |
self.special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
|
76 |
"<|user|>", "<|assistant|>", "<|observation|>"]
|
|
|
53 |
return NotImplemented
|
54 |
|
55 |
class TikTokenizer(BaseTokenizer):
|
56 |
+
vocab_files_names = {"vocab_file": "tokenizer.tiktoken"}
|
|
|
|
|
57 |
|
58 |
+
def __init__(self, vocab_file, **kwargs):
|
59 |
pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
60 |
self.pat_str = re.compile(pat_str)
|
61 |
|
62 |
self.b64_vocab = {}
|
63 |
+
mergeable_ranks = {}
|
64 |
+
with open(vocab_file) as f:
|
65 |
+
for line in f:
|
66 |
+
token, rank = line.strip().split()
|
67 |
+
rank = int(rank)
|
68 |
+
token = base64.b64decode(token)
|
69 |
+
mergeable_ranks[token] = rank
|
70 |
+
self.b64_vocab['%s' % token] = rank
|
|
|
71 |
|
72 |
self.special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
|
73 |
"<|user|>", "<|assistant|>", "<|observation|>"]
|