LightFury9
committed
Upload 2 files
- tenglish_arcade.tiktoken +0 -0
- tokenization_arcade100k.py +7 -7
tenglish_arcade.tiktoken
CHANGED
The diff for this file is too large to render. See raw diff.
tokenization_arcade100k.py
CHANGED
```diff
@@ -113,13 +113,13 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         self._tiktoken_config = _arcade100k(vocab_file)
         self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
 
-        # TODO: Remove this assertion
-
-
-
-
-
-
+        # # TODO: Remove this assertion
+        assert (
+            len(self.tokenizer._mergeable_ranks)
+            + len(self.tokenizer._special_tokens)
+            + 1
+            == self.tokenizer.n_vocab
+        ), f"{len(self.tokenizer._mergeable_ranks) + len(self.tokenizer._special_tokens)} != {self.tokenizer.n_vocab} in encoding"
 
         self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
         self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
```
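The substantive change here is the re-enabled sanity check: the number of BPE merge ranks plus special tokens must equal `n_vocab` minus one. Since tiktoken reports `n_vocab` as the maximum token id plus one, the `+ 1` suggests the uploaded tenglish vocab leaves exactly one token id unused (an inference from the diff, not stated in the commit message). Below is a minimal standalone sketch of the same check, assuming the file and class names shipped in this commit:

```python
# Minimal sketch, assuming the names from this commit:
# tokenization_arcade100k.py sits next to tenglish_arcade.tiktoken.
from tokenization_arcade100k import Arcade100kTokenizer

tok = Arcade100kTokenizer("tenglish_arcade.tiktoken")
enc = tok.tokenizer  # the underlying tiktoken.Encoding

# Count every token id that is actually defined: BPE ranks plus specials.
defined = len(enc._mergeable_ranks) + len(enc._special_tokens)

# tiktoken computes n_vocab as max token id + 1, so any skipped id shows
# up as a gap between the two numbers; the assertion above expects gap == 1.
print(f"defined={defined} n_vocab={enc.n_vocab} gap={enc.n_vocab - defined}")
```

If a future vocab change moves the gap away from one, the assertion's f-string message prints both counts side by side, which makes the off-by-N visible immediately at load time.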