rajammanabrolu
committed on
Commit
•
cd5239e
1
Parent(s):
fd5557b
Update tiktoken.py
Browse files- tiktoken.py +6 -3
tiktoken.py
CHANGED
@@ -198,7 +198,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
|
|
198 |
'{% else %}'
|
199 |
"{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
|
200 |
'{% endif %}'
|
201 |
-
'{% if (add_generation_prompt == true) %}'
|
202 |
"{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
|
203 |
"{% elif (message['role'] == 'assistant') %}"
|
204 |
'{{ eos_token }}'
|
@@ -253,7 +253,10 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
|
|
253 |
|
254 |
def _convert_id_to_token(self, index: int) -> Optional[str]:
|
255 |
"""Converts an index (integer) in a token (str) using the vocab."""
|
256 |
-
|
|
|
|
|
|
|
257 |
|
258 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
259 |
"""Converts a sequence of tokens (string) in a single string."""
|
@@ -370,4 +373,4 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
|
|
370 |
return tensor
|
371 |
|
372 |
|
373 |
-
TiktokenTokenizerWrapper.register_for_auto_class()
|
|
|
198 |
'{% else %}'
|
199 |
"{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
|
200 |
'{% endif %}'
|
201 |
+
'{% if (add_generation_prompt == true and loop.last) %}'
|
202 |
"{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
|
203 |
"{% elif (message['role'] == 'assistant') %}"
|
204 |
'{{ eos_token }}'
|
|
|
253 |
|
254 |
def _convert_id_to_token(self, index: int) -> Optional[str]:
|
255 |
"""Converts an index (integer) in a token (str) using the vocab."""
|
256 |
+
# For tokens in either the gap in ids in the tokenizer, or beyond the range of the tokenizer,
|
257 |
+
# we return empty string. This matches the behavior of Hugging Face fast tokenizers,
|
258 |
+
# but not slow tokenizers.
|
259 |
+
return self.decoder.get(index, '')
|
260 |
|
261 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
262 |
"""Converts a sequence of tokens (string) in a single string."""
|
|
|
373 |
return tensor
|
374 |
|
375 |
|
376 |
+
TiktokenTokenizerWrapper.register_for_auto_class()
|