rajammanabrolu committed
Commit e1eee7f • 1 Parent(s): bc6270a

Update tiktoken.py

tiktoken.py CHANGED (+29 -15)
@@ -127,7 +127,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
 
-        self.decoder = {}
+        self.decoder: Dict[int, str] = {}
         for i in range(self.encoding.n_vocab):
             try:
                 self.encoding.decode_single_token_bytes(i)
@@ -141,7 +141,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
             ])
             self.decoder[i] = decoding
 
-        self.encoder = {}
+        self.encoder: Dict[str, int] = {}
        for i in range(self.encoding.n_vocab):
             if i in self.decoder:
                 self.encoder[self.decoder[i]] = i
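For context on what the two annotated dicts hold: `decoder` maps each valid tiktoken id to a printable-unicode string via the GPT-2 byte-to-unicode trick, and `encoder` inverts it. A minimal standalone sketch of the same idea (not the file's exact code), assuming the `tiktoken` package and `bytes_to_unicode` from `transformers`; the `cl100k_base` encoding is an illustrative choice:

from typing import Dict

import tiktoken
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

encoding = tiktoken.get_encoding('cl100k_base')
byte_encoder = bytes_to_unicode()  # byte value (0-255) -> printable unicode char

decoder: Dict[int, str] = {}
for i in range(encoding.n_vocab):
    try:
        token_bytes = encoding.decode_single_token_bytes(i)
    except Exception:
        continue  # some ids in [0, n_vocab) are unused and raise
    decoder[i] = ''.join(byte_encoder[b] for b in token_bytes)

encoder: Dict[str, int] = {s: i for i, s in decoder.items()}
assert encoder[decoder[100]] == 100  # the two dicts invert each other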
@@ -173,12 +173,30 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         Pinning default Chat ML template in case defaults change.
         """
         template = (
-            "{% set system_message = '' %}"
-            '{% if USE_DEFAULT_PROMPT == true %}'
-            "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+            "{% if messages[0]['role'] == 'system' %}"
+            '{% set loop_messages = messages[1:] %}'
+            "{% set system_message = messages[0]['content'] %}"
+            "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
+            '{% set loop_messages = messages %}'
+            "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+            '{% else %}'
+            '{% set loop_messages = messages %}'
+            '{% set system_message = false %}'
+            '{% endif %}'
+            '{% for message in loop_messages %}'
+            '{% if loop.index0 == 0 %}'
+            '{% if system_message != false %}'
+            "{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}"
+            '{% endif %}'
+            "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% else %}'
+            "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% endif %}'
+            '{% if (add_generation_prompt == true) %}'
+            "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
+            "{% elif (message['role'] == 'assistant') %}"
+            '{{ eos_token }}'
             '{% endif %}'
-            '{% for message in messages %}'
-            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
             '{% endfor %}')
         template = template.replace(
             'USE_DEFAULT_PROMPT',
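One way to sanity-check the reworked template is to render it directly with jinja2, which is what `apply_chat_template` uses under the hood. A sketch, assuming `template` holds the string built above (after the `USE_DEFAULT_PROMPT` and `DEFAULT_SYSTEM_PROMPT` replacements); the messages and `eos_token` value are made up:

from jinja2 import Template

messages = [
    {'role': 'system', 'content': 'You are helpful.'},
    {'role': 'user', 'content': 'Hi!'},
    {'role': 'assistant', 'content': 'Hello!'},
]
print(Template(template).render(messages=messages,
                                add_generation_prompt=False,
                                eos_token='<|endoftext|>'))
# <|im_start|>system
# You are helpful.
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant
# Hello!<|im_end|><|endoftext|>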
@@ -188,11 +206,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         return template
 
     def get_vocab(self) -> Dict[str, int]:
-        """Returns vocab as a dict.
-
-        Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
-        Most uses do not need to use get_vocab, so this is not a priority to fix.
-        """
+        """Returns vocab as a dict."""
         # As far as I can tell, we don't require get_vocab to completely work,
         # but when using additional_special_tokens, Hugging Face determines the next
         # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
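The trimmed docstring defers to the comment kept below it: Hugging Face computes the id of a newly added token from the length of `get_vocab()`, so the dict's size is what has to be right. A hypothetical illustration of that contract, not code from this file:

# Stand-in vocab with the correct size; tiktoken's cl100k_base, for example,
# reports n_vocab == 100277.
vocab = {f'token_{i}': i for i in range(100277)}

# Hugging Face assigns the next added special token the id len(get_vocab()),
# so a correctly sized dict keeps the new id from colliding with existing ones.
next_token_id = len(vocab)
assert next_token_id == 100277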
@@ -227,15 +241,15 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
 
         return tokens
 
-    def _convert_token_to_id(self, token: str):
+    def _convert_token_to_id(self, token: str) -> Optional[int]:
         """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def _convert_id_to_token(self, index: int):
+    def _convert_id_to_token(self, index: int) -> Optional[str]:
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)
 
-    def convert_tokens_to_string(self, tokens: List[str]):
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
         """Converts a sequence of tokens (string) in a single string."""
         text = ''.join(tokens)
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8')
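The last hunk sits on top of the byte-level scheme from the top of the file: token strings are drawn from the GPT-2 printable alphabet, and `convert_tokens_to_string` maps them back through `byte_decoder` to real UTF-8 text. A self-contained round-trip sketch, assuming `bytes_to_unicode` from `transformers`; the sample text is arbitrary:

from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

text = 'héllo'  # non-ASCII on purpose: it spans multiple UTF-8 bytes
mangled = ''.join(byte_encoder[b] for b in text.encode('utf-8'))
restored = bytearray([byte_decoder[c] for c in mangled]).decode('utf-8')
assert restored == text  # mangling and unmangling are exact inverses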