LightFury9 committed
Commit ba7e1d0 · verified · 1 Parent(s): 99fd8a8

Upload 2 files

tenglish_arcade.tiktoken CHANGED
The diff for this file is too large to render.
 
tokenization_arcade100k.py CHANGED
@@ -113,13 +113,13 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         self._tiktoken_config = _arcade100k(vocab_file)
         self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
 
-        # TODO: Remove this assertion
-        #assert (
-        #    len(self.tokenizer._mergeable_ranks)
-        #    + len(self.tokenizer._special_tokens)
-        #    + 1
-        #    == self.tokenizer.n_vocab
-        #), f"{len(self.tokenizer._mergeable_ranks) + len(self.tokenizer._special_tokens)} != {self.tokenizer.n_vocab} in encoding"
+        # # TODO: Remove this assertion
+        assert (
+            len(self.tokenizer._mergeable_ranks)
+            + len(self.tokenizer._special_tokens)
+            + 1
+            == self.tokenizer.n_vocab
+        ), f"{len(self.tokenizer._mergeable_ranks) + len(self.tokenizer._special_tokens)} != {self.tokenizer.n_vocab} in encoding"
 
         self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
         self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
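
The net change re-enables the vocab-size consistency check that was previously commented out: the BPE merge ranks plus the special tokens must account for every ID in tiktoken's reported vocabulary, where n_vocab is max_token_value + 1, so the "+ 1" term implies the arcade100k ID space leaves exactly one token ID unused. Below is a minimal sketch of why that arithmetic holds, using a toy tiktoken.Encoding with one deliberately skipped ID; the name, pattern, and tokens are invented for illustration and are not part of this repo.

# Toy encoding: merge ranks occupy IDs 0-1, the special token sits at ID 3,
# so ID 2 is intentionally left unused (mirroring the "+ 1" in the assert).
import tiktoken

enc = tiktoken.Encoding(
    name="toy",                          # hypothetical name
    pat_str=r"[^\s]+|\s+",               # trivial split pattern, illustration only
    mergeable_ranks={b"a": 0, b"b": 1},  # 2 merge ranks
    special_tokens={"<|end|>": 3},       # 1 special token, leaving a gap at ID 2
)

# tiktoken defines n_vocab as max_token_value + 1, so the single unused
# ID surfaces as the "+ 1" term in the re-enabled assertion.
assert (
    len(enc._mergeable_ranks) + len(enc._special_tokens) + 1 == enc.n_vocab
), f"{len(enc._mergeable_ranks) + len(enc._special_tokens)} != {enc.n_vocab}"
print(enc.n_vocab)  # -> 4

If the uploaded vocab file ever drifted (say, a special token added without updating the merge table), the assert would fail with both counts in its message, which is presumably why this commit turns the check back on rather than deleting it.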